diff --git a/.gitmodules b/.gitmodules index bc771d8c6ee..e69de29bb2d 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +0,0 @@ -[submodule "doc/notebook"] - path = doc/notebook - url = https://github.com/espnet/notebook diff --git a/README.md b/README.md index 3ab5e55649c..22cffb798be 100644 --- a/README.md +++ b/README.md @@ -78,6 +78,10 @@ ESPnet uses [pytorch](http://pytorch.org/) as a deep learning engine and also fo - Set `frontend` to be `s3prl` - Select any upstream model by setting the `frontend_conf` to the corresponding name. - Streaming Transformer/Conformer ASR with blockwise synchronous beam search. +- Restricted Self-Attention based on [Longformer](https://arxiv.org/abs/2004.05150) as an encoder for long sequences + +### SUM: Speech Summarization +- End to End Speech Summarization Recipe for Instructional Videos using Restricted Self-Attention [[Sharma et al., 2022]](https://arxiv.org/abs/2110.06263) Demonstration - Real-time ASR demo with ESPnet2 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/espnet/notebook/blob/master/espnet2_asr_realtime_demo.ipynb) @@ -129,7 +133,7 @@ To train the neural vocoder, please check the following repositories: - Multi-speaker speech separation - Unified encoder-separator-decoder structure for time-domain and frequency-domain models - Encoder/Decoder: STFT/iSTFT, Convolution/Transposed-Convolution - - Separators: BLSTM, Transformer, Conformer, DPRNN, [DCCRN](https://arxiv.org/abs/2008.00264), [Deep Clustering](https://ieeexplore.ieee.org/document/7471631), [Deep Attractor Network](https://pubmed.ncbi.nlm.nih.gov/29430212/), Neural Beamformers, etc. + - Separators: BLSTM, Transformer, Conformer, [TasNet](https://arxiv.org/abs/1809.07454), [DPRNN](https://arxiv.org/abs/1910.06379), [DC-CRN](https://web.cse.ohio-state.edu/~wang.77/papers/TZW.taslp21.pdf), [DCCRN](https://arxiv.org/abs/2008.00264), [Deep Clustering](https://ieeexplore.ieee.org/document/7471631), [Deep Attractor Network](https://pubmed.ncbi.nlm.nih.gov/29430212/), Neural Beamformers, etc. - Flexible ASR integration: working as an individual task or as the ASR frontend - Easy to import pretrained models from [Asteroid](https://github.com/asteroid-team/asteroid) - Both the pre-trained models from Asteroid and the specific configuration are supported. diff --git a/ci/doc.sh b/ci/doc.sh index cbcd78f4b21..114bc92b952 100755 --- a/ci/doc.sh +++ b/ci/doc.sh @@ -26,6 +26,8 @@ set -euo pipefail find ./utils/{*.sh,spm_*} -exec ./doc/usage2rst.sh {} \; | tee ./doc/_gen/utils_sh.rst find ./espnet2/bin/*.py -exec ./doc/usage2rst.sh {} \; | tee ./doc/_gen/espnet2_bin.rst +./doc/notebook2rst.sh > ./doc/_gen/notebooks.rst + # generate package doc ./doc/module2rst.py --root espnet espnet2 --dst ./doc --exclude espnet.bin diff --git a/ci/install.sh b/ci/install.sh index eeb531d7ddd..5bfed7584ad 100755 --- a/ci/install.sh +++ b/ci/install.sh @@ -21,7 +21,7 @@ ${CXX:-g++} -v . ./activate_python.sh make TH_VERSION="${TH_VERSION}" - make warp-ctc.done warp-transducer.done chainer_ctc.done nkf.done moses.done mwerSegmenter.done pesq pyopenjtalk.done py3mmseg.done s3prl.done transformers.done phonemizer.done fairseq.done k2.done gtn.done + make warp-ctc.done warp-transducer.done chainer_ctc.done nkf.done moses.done mwerSegmenter.done pesq pyopenjtalk.done py3mmseg.done s3prl.done transformers.done phonemizer.done fairseq.done k2.done gtn.done longformer.done rm -rf kaldi ) . 
tools/activate_python.sh diff --git a/doc/.gitignore b/doc/.gitignore index d4058a5aa91..79f7202744d 100644 --- a/doc/.gitignore +++ b/doc/.gitignore @@ -1,4 +1,4 @@ _gen/ _build/ build/ - +notebook/ \ No newline at end of file diff --git a/doc/index.rst b/doc/index.rst index 13f20ab0a96..30cd3d35fd4 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -28,16 +28,7 @@ ESPnet is an end-to-end speech processing toolkit, mainly focuses on end-to-end ./espnet2_task.md ./espnet2_distributed.md -.. toctree:: - :maxdepth: 1 - :caption: Notebook: - - ./notebook/asr_cli.ipynb - ./notebook/asr_library.ipynb - ./notebook/tts_cli.ipynb - ./notebook/pretrained.ipynb - ./notebook/tts_realtime_demo.ipynb - ./notebook/st_demo.ipynb +.. include:: ./_gen/notebooks.rst .. include:: ./_gen/modules.rst diff --git a/doc/installation.md b/doc/installation.md index 0a1c8acf022..db45a09135b 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -32,14 +32,14 @@ the following packages are installed using Anaconda, so you can skip them.) # For CentOS $ sudo yum install libsndfile ``` -- ffmpeg (This is not required when installataion, but used in some recipes) +- ffmpeg (This is not required when installing, but used in some recipes) ```sh # For Ubuntu $ sudo apt-get install ffmpeg # For CentOS $ sudo yum install ffmpeg ``` -- flac (This is not required when installataion, but used in some recipes) +- flac (This is not required when installing, but used in some recipes) ```sh # For Ubuntu $ sudo apt-get install flac diff --git a/doc/notebook b/doc/notebook deleted file mode 160000 index ef3cbf880fc..00000000000 --- a/doc/notebook +++ /dev/null @@ -1 +0,0 @@ -Subproject commit ef3cbf880fcd725d11021e541a0cdfae4080446d diff --git a/doc/notebook2rst.sh b/doc/notebook2rst.sh new file mode 100755 index 00000000000..83bf7d57794 --- /dev/null +++ b/doc/notebook2rst.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +set -euo pipefail + +cd "$(dirname "$0")" + +if [ ! -d notebook ]; then + git clone https://github.com/espnet/notebook --depth 1 +fi + +echo "\ +.. 
toctree:: + :maxdepth: 1 + :caption: Notebook: +" + +find ./notebook/*.ipynb -exec echo " {}" \; diff --git a/egs2/README.md b/egs2/README.md index dcbd80bf5b9..8da8f300214 100755 --- a/egs2/README.md +++ b/egs2/README.md @@ -8,39 +8,40 @@ See: https://espnet.github.io/espnet/espnet2_tutorial.html#recipes-using-espnet2 | Directory name | Corpus name | Task | Language | URL | Note | | ----------------------- | --------------------------------------------------------------------------------------- | ----------------------- | --------------------- | ------------------------------------------------------------------------------------------------------------ | ------------ | -| aidatatang_200zh | Aidatatang_200zh A free Chinese Mandarin speech corpus | ASR | CMN | http://www.openslr.org/resources/62 | | -| aishell | AISHELL-ASR0009-OS1 Open Source Mandarin Speech Corpus | ASR | CMN | http://www.aishelltech.com/kysjcp | | -| aishell3 | AISHELL3 Mandarin multi-speaker text-to-speech | TTS | CMN | https://www.openslr.org/93/ | | -| ami | The AMI Meeting Corpus | ASR | ENG | http://groups.inf.ed.ac.uk/ami/corpus/ | | -| an4 | CMU AN4 database | ASR/TTS | ENG | http://www.speech.cs.cmu.edu/databases/an4/ | | -| babel | IARPA Babel corups | ASR | ~20 languages | https://www.iarpa.gov/index.php/research-programs/babel | | -| bn_openslr53 | Large bengali ASR training dataset | ASR | BEN | https://openslr.org/53/ | | +| aidatatang_200zh | Aidatatang_200zh A free Chinese Mandarin speech corpus | ASR | CMN | http://www.openslr.org/resources/62 | | +| aishell | AISHELL-ASR0009-OS1 Open Source Mandarin Speech Corpus | ASR | CMN | http://www.aishelltech.com/kysjcp | | +| aishell3 | AISHELL3 Mandarin multi-speaker text-to-speech | TTS | CMN | https://www.openslr.org/93/ | | +| ami | The AMI Meeting Corpus | ASR | ENG | http://groups.inf.ed.ac.uk/ami/corpus/ | | +| an4 | CMU AN4 database | ASR/TTS | ENG | http://www.speech.cs.cmu.edu/databases/an4/ | | +| babel | IARPA Babel corups | ASR | ~20 languages | https://www.iarpa.gov/index.php/research-programs/babel | | +| bn_openslr53 | Large bengali ASR training dataset | ASR | BEN | https://openslr.org/53/ | | | catslu | CATSLU-MAPS | SLU | CMN | https://sites.google.com/view/catslu/home | | -| chime4 | The 4th CHiME Speech Separation and Recognition Challenge | ASR/Multichannel ASR | ENG | http://spandh.dcs.shef.ac.uk/chime_challenge/chime2016/ | | -| cmu_indic | CMU INDIC | TTS | 7 languages | http://festvox.org/cmu_indic/ | | +| chime4 | The 4th CHiME Speech Separation and Recognition Challenge | ASR/Multichannel ASR | ENG | http://spandh.dcs.shef.ac.uk/chime_challenge/chime2016/ | | +| cmu_indic | CMU INDIC | TTS | 7 languages | http://festvox.org/cmu_indic/ | | | commonvoice | The Mozilla Common Voice | ASR | 13 languages | https://voice.mozilla.org/datasets | | -| csj | Corpus of Spontaneous Japanese | ASR | JPN | https://pj.ninjal.ac.jp/corpus_center/csj/en/ | | -| csmsc | Chinese Standard Mandarin Speech Copus | TTS | CMN | https://www.data-baker.com/open_source.html | | +| csj | Corpus of Spontaneous Japanese | ASR | JPN | https://pj.ninjal.ac.jp/corpus_center/csj/en/ | | +| csmsc | Chinese Standard Mandarin Speech Copus | TTS | CMN | https://www.data-baker.com/open_source.html | | | css10 | CSS10: A Collection of Single Speaker Speech Datasets for 10 Languages | TTS | 10 langauges | https://github.com/Kyubyong/css10 | | -| dirha_wsj | Distant-speech Interaction for Robust Home Applications | Multichannel ASR | ENG | https://dirha.fbk.eu/, 
https://github.com/SHINE-FBK/DIRHA_English_wsj | | -| dns_ins20 | Deep Noise Suppression Challenge – INTERSPEECH 2020 | SE | 7 languages + singing | https://www.microsoft.com/en-us/research/academic-program/deep-noise-suppression-challenge-interspeech-2020/ | | -| fisher_callhome_spanish | Fisher and CALLHOME Spanish--English Speech Translation | ASR/ST | SPA->ENG | https://catalog.ldc.upenn.edu/LDC2014T23 | | -| fsc | Fluent Speech Commands Dataset | SLU | ENG | https://fluent.ai/fluent-speech-commands-a-dataset-for-spoken-language-understanding-research/ | | -| fsc_unseen | Fluent Speech Commands Dataset MASE Eval Unseen splits | SLU | ENG | https://github.com/maseEval/mase | | -| fsc_challenge | Fluent Speech Commands Dataset MASE Eval Challenge splits | SLU | ENG | https://github.com/maseEval/mase | | -| gigaspeech | GigaSpeech: An Evolving, Multi-domain ASR Corpus with 10,000 Hours of Transcribed Audio | ASR | ENG | https://github.com/SpeechColab/GigaSpeech | | -| grabo | Grabo dataset | SLU | ENG + NLD | https://www.esat.kuleuven.be/psi/spraak/downloads/ | | -| hkust | HKUST/MTS: A very large scale Mandarin telephone speech corpus | ASR | CMN | https://catalog.ldc.upenn.edu/LDC2005S15 | | -| hui_acg | HUI-audio-corpus-german | TTS | DEU | https://opendata.iisys.de/datasets.html#hui-audio-corpus-german | | +| dirha_wsj | Distant-speech Interaction for Robust Home Applications | Multichannel ASR | ENG | https://dirha.fbk.eu/, https://github.com/SHINE-FBK/DIRHA_English_wsj | | +| dns_ins20 | Deep Noise Suppression Challenge – INTERSPEECH 2020 | SE | 7 languages + singing | https://www.microsoft.com/en-us/research/academic-program/deep-noise-suppression-challenge-interspeech-2020/ | | +| dsing | Automatic Lyric Transcription from Karaoke Vocal Tracks (From DAMP Sing300x30x2) | ASR (ALT) | ENG singing | https://github.com/groadabike/Kaldi-Dsing-task | | +| fisher_callhome_spanish | Fisher and CALLHOME Spanish--English Speech Translation | ASR/ST | SPA->ENG | https://catalog.ldc.upenn.edu/LDC2014T23 | | +| fsc | Fluent Speech Commands Dataset | SLU | ENG | https://fluent.ai/fluent-speech-commands-a-dataset-for-spoken-language-understanding-research/ | | +| fsc_unseen | Fluent Speech Commands Dataset MASE Eval Unseen splits | SLU | ENG | https://github.com/maseEval/mase | | +| fsc_challenge | Fluent Speech Commands Dataset MASE Eval Challenge splits | SLU | ENG | https://github.com/maseEval/mase | | +| gigaspeech | GigaSpeech: An Evolving, Multi-domain ASR Corpus with 10,000 Hours of Transcribed Audio | ASR | ENG | https://github.com/SpeechColab/GigaSpeech | | +| grabo | Grabo dataset | SLU | ENG + NLD | https://www.esat.kuleuven.be/psi/spraak/downloads/ | | +| hkust | HKUST/MTS: A very large scale Mandarin telephone speech corpus | ASR | CMN | https://catalog.ldc.upenn.edu/LDC2005S15 | | +| hui_acg | HUI-audio-corpus-german | TTS | DEU | https://opendata.iisys.de/datasets.html#hui-audio-corpus-german | | | how2 | How2: A Large-scale Dataset for Multimodal Language Understanding | ASR/MT/ST | ENG->POR | https://github.com/srvk/how2-dataset | | -| iemocap | IEMOCAP database: The Interactive Emotional Dyadic Motion Capture database | SLU | ENG | https://sail.usc.edu/iemocap/ | | -| iwslt21_low_resource | ALFFA, IARPA Babel, Gamayun, IWSLT 2021 | ASR | SWA | http://www.openslr.org/25/ https://catalog.ldc.upenn.edu/LDC2017S05 https://gamayun.translatorswb.org/data/ https://iwslt.org/2021/low-resource | | +| iemocap | IEMOCAP database: The Interactive Emotional Dyadic Motion Capture database | 
SLU | ENG | https://sail.usc.edu/iemocap/ | | +| iwslt21_low_resource | ALFFA, IARPA Babel, Gamayun, IWSLT 2021 | ASR | SWA | http://www.openslr.org/25/ https://catalog.ldc.upenn.edu/LDC2017S05 https://gamayun.translatorswb.org/data/ https://iwslt.org/2021/low-resource | | | jdcinal | Japanese Dialogue Corpus of Information Navigation and Attentive Listening Annotated with Extended ISO-24617-2 Dialogue Act Tags | SLU | JPN | http://www.lrec-conf.org/proceedings/lrec2018/pdf/464.pdf http://tts.speech.cs.cmu.edu/awb/infomation_navigation_and_attentive_listening_0.2.zip | | | jkac | J-KAC: Japanese Kamishibai and audiobook corpus | TTS | JPN | https://sites.google.com/site/shinnosuketakamichi/research-topics/j-kac_corpus | | | jmd | JMD: Japanese multi-dialect corpus for speech synthesis | TTS | JPN | https://sites.google.com/site/shinnosuketakamichi/research-topics/jmd_corpus | | | jsss | JSSS: Japanese speech corpus for summarization and simplification | TTS | JPN | https://sites.google.com/site/shinnosuketakamichi/research-topics/jsss_corpus | | | jsut | Japanese speech corpus of Saruwatari-lab., University of Tokyo | ASR/TTS | JPN | https://sites.google.com/site/shinnosuketakamichi/publication/jsut | | -| jtubespeech | Japanese YouTube Speech corpus | ASR/TTS | JPN | | | +| jtubespeech | Japanese YouTube Speech corpus | ASR/TTS | JPN | | | | jv_openslr35 | Javanese | ASR | JAV | http://www.openslr.org/35 | | | jvs | JVS (Japanese versatile speech) corpus | TTS | JPN | https://sites.google.com/site/shinnosuketakamichi/research-topics/jvs_corpus | | | ksponspeech | KsponSpeech (Korean spontaneous speech) corpus | ASR | KOR | https://aihub.or.kr/aidata/105 | | @@ -49,26 +50,29 @@ See: https://espnet.github.io/espnet/espnet2_tutorial.html#recipes-using-espnet2 | librimix | LibriMix: An Open-Source Dataset for Generalizable Speech Separation | SE | ENG | https://github.com/JorisCos/LibriMix | | | librispeech | LibriSpeech ASR corpus | ASR | ENG | http://www.openslr.org/12 | | | librispeech_100 | LibriSpeech ASR corpus 100h subset | ASR | ENG | http://www.openslr.org/12 | | -| libritts | LibriTTS corpus | TTS | ENG | http://www.openslr.org/60 | | +| libritts | LibriTTS corpus | TTS | ENG | http://www.openslr.org/60 | | | ljspeech | The LJ Speech Dataset | TTS | ENG | https://keithito.com/LJ-Speech-Dataset/ | | +| lrs3 | The Oxford-BBC Lip Reading Sentences 3 (LRS3) Dataset | ASR | ENG | https://www.robots.ox.ac.uk/~vgg/data/lip_reading/lrs3.html | | | lrs2 | The Oxford-BBC Lip Reading Sentences 2 (LRS2) Dataset | Lipreading/ASR | ENG | https://www.robots.ox.ac.uk/~vgg/data/lip_reading/lrs2.html | | | mini_an4 | Mini version of CMU AN4 database for the integration test | ASR/TTS/SE | ENG | http://www.speech.cs.cmu.edu/databases/an4/ | | | mini_librispeech | Mini version of Librispeech corpus | DIAR | ENG | https://openslr.org/31/ | | -| mls | MLS (A large multilingual corpus derived from LibriVox audiobooks) | ASR | 8 languages | http://www.openslr.org/94/ | | +| mls | MLS (A large multilingual corpus derived from LibriVox audiobooks) | ASR | 8 languages | http://www.openslr.org/94/ | | +| mr_openslr64 | OpenSLR Marathi Corpus | ASR | MAR | http://www.openslr.org/64/ | | +| ms_indic_is18 | Microsoft Speech Corpus (Indian languages) | ASR | 3 langs: TEL TAM GUJ | https://msropendata.com/datasets/7230b4b1-912d-400e-be58-f84e0512985e | | | nsc | National Speech Corpus | ASR | ENG-SG | https://www.imda.gov.sg/programme-listing/digital-services-lab/national-speech-corpus | | -| open_li52 | Corpus 
combination with 52 languages(Commonvocie + voxforge) | Multilingual ASR | 52 languages | | | +| open_li52 | Corpus combination with 52 languages(Commonvocie + voxforge) | Multilingual ASR | 52 languages | | | | polyphone_swiss_french | Swiss French Polyphone corpus | ASR | FRA | http://catalog.elra.info/en-us/repository/browse/ELRA-S0030_02 | | | primewords_chinese | Primewords Chinese Corpus Set 1 | ASR | CMN | https://www.openslr.org/47/ | | -| puebla_nahuatl | Highland Puebla Nahuatl corpus (endangered language in central Mexico) | ASR | HPN | https://www.openslr.org/92/ | | +| puebla_nahuatl | Highland Puebla Nahuatl corpus (endangered language in central Mexico) | ASR | HPN | https://www.openslr.org/92/ | | | reverb | REVERB (REverberant Voice Enhancement and Recognition Benchmark) challenge | ASR | ENG | https://reverb2014.dereverberation.com/ | | | ru_open_stt | Russian Open Speech To Text (STT/ASR) Dataset | ASR | RUS | https://github.com/snakers4/open_stt | | | ruslan | RUSLAN: Russian Spoken Language Corpus For Speech Synthesis | TTS | RUS | https://ruslan-corpus.github.io/ | | | snips | SNIPS: A dataset for spoken language understanding | SLU | ENG | https://github.com/sonos/spoken-language-understanding-research-datasets | | | seame | SEAME: a Mandarin-English Code-switching Speech Corpus in South-East Asia | ASR | ENG + CMN | https://catalog.ldc.upenn.edu/LDC2015S04 | | -| siwis | SIWIS: Spoken Interaction with Interpretation in Switzerland | TTS | FRA | https://https://datashare.ed.ac.uk/handle/10283/2353 | | +| siwis | SIWIS: Spoken Interaction with Interpretation in Switzerland | TTS | FRA | https://https://datashare.ed.ac.uk/handle/10283/2353 | | | slue-voxceleb | SLUE: Spoken Language Understanding Evaluation | SLU | ENG | https://github.com/asappresearch/slue-toolkit | | | slurp | SLURP: A Spoken Language Understanding Resource Package | SLU | ENG | https://github.com/pswietojanski/slurp | | -| slurp_entity | SLURP: A Spoken Language Understanding Resource Package | SLU/Entity Classification | ENG | https://github.com/pswietojanski/slurp | | +| slurp_entity | SLURP: A Spoken Language Understanding Resource Package | SLU/Entity Classifi. 
| ENG | https://github.com/pswietojanski/slurp | | | sms_wsj | SMS-WSJ: A database for in-depth analysis of multi-channel source separation algorithms | SE | ENG | https://github.com/fgnt/sms_wsj | | | speechcommands | Speech Commands: A Dataset for Limited-Vocabulary Speech Recognition | SLU | ENG | https://www.tensorflow.org/datasets/catalog/speech_commands | | | spgispeech | SPGISpeech 5k corpus | ASR | ENG | https://datasets.kensho.com/datasets/scribe | | @@ -79,12 +83,12 @@ See: https://espnet.github.io/espnet/espnet2_tutorial.html#recipes-using-espnet2 | tedlium2 | TED-LIUM corpus release 2 | ASR | ENG | https://www.openslr.org/19/, http://www.lrec-conf.org/proceedings/lrec2014/pdf/1104_Paper.pdf | | | thchs30 | A Free Chinese Speech Corpus Released by CSLT@Tsinghua University | TTS | CMN | https://www.openslr.org/18/ | | | timit | TIMIT Acoustic-Phonetic Continuous Speech Corpus | ASR | ENG | https://catalog.ldc.upenn.edu/LDC93S1 | | -| totonac | Highland Totonac corpus (endangered language in central Mexico) | ASR | TOS | http://www.openslr.org/107/ | | -| tsukuyomi | つくよみちゃんコーパス | TTS | JPN | https://tyc.rei-yumesaki.net/material/corpus | | -| vctk | English Multi-speaker Corpus for CSTR Voice Cloning Toolkit | TTS | ENG | http://www.udialogue.org/download/cstr-vctk-corpus.html | | +| totonac | Highland Totonac corpus (endangered language in central Mexico) | ASR | TOS | http://www.openslr.org/107/ | | +| tsukuyomi | つくよみちゃんコーパス | TTS | JPN | https://tyc.rei-yumesaki.net/material/corpus | | +| vctk | English Multi-speaker Corpus for CSTR Voice Cloning Toolkit | ASR/TTS | ENG | http://www.udialogue.org/download/cstr-vctk-corpus.html | | | vctk_noisyreverb | Noisy reverberant speech database (48kHz) | SE | ENG | https://datashare.ed.ac.uk/handle/10283/2826 | | | vivos | VIVOS (Vietnamese corpus for ASR) | ASR | VIE | https://ailab.hcmus.edu.vn/vivos/ | | -| voxforge | VoxForge | ASR | 7 languages | http://www.voxforge.org/ | | +| voxforge | VoxForge | ASR | 7 languages | http://www.voxforge.org/ | | | wenetspeech | WenetSpeech: A 10000+ Hours Multi-domain Chinese Corpus for Speech Recognition | ASR | CMN | https://wenet-e2e.github.io/WenetSpeech/ | | | wham | The WSJ0 Hipster Ambient Mixtures (WHAM!) dataset | SE | ENG | https://wham.whisper.ai/ | | | whamr | WHAMR!: Noisy and Reverberant Single-Channel Speech Separation | SE | ENG | https://wham.whisper.ai/ | | @@ -94,3 +98,4 @@ See: https://espnet.github.io/espnet/espnet2_tutorial.html#recipes-using-espnet2 | yesno | The "yesno" corpus | ASR | HEB | http://www.openslr.org/1 | | | yoloxochitl_mixtec | Yoloxochitl-Mixtec corpus (endangered language in central Mexico) | ASR | XTY | http://www.openslr.org/89 | | | zeroth_korean | Zeroth-Korean | ASR | KOR | http://www.openslr.org/40 | | +| zh_openslr38 | ST-CMDS-20170001_1, Free ST Chinese Mandarin Corpus | ASR | CMN | http://www.openslr.org/38 | | \ No newline at end of file diff --git a/egs2/TEMPLATE/asr1/asr.sh b/egs2/TEMPLATE/asr1/asr.sh index 04f7578b5b0..f4d7a8ad24a 100755 --- a/egs2/TEMPLATE/asr1/asr.sh +++ b/egs2/TEMPLATE/asr1/asr.sh @@ -110,6 +110,8 @@ k2_config=./conf/decode_asr_transformer_with_k2.yaml use_streaming=false # Whether to use streaming decoding +use_maskctc=false # Whether to use maskctc decoding + batch_size=1 inference_tag= # Suffix to the result dir for decoding. inference_config= # Config for decoding. @@ -224,6 +226,7 @@ Options: --inference_asr_model # ASR model path for decoding (default="${inference_asr_model}"). 
--download_model # Download a model from Model Zoo and use it for decoding (default="${download_model}"). --use_streaming # Whether to use streaming decoding (default="${use_streaming}"). + --use_maskctc # Whether to use maskctc decoding (default="${use_maskctc}"). # [Task dependent] Set the datadir name created by local/data.sh --train_set # Name of training set (required). @@ -895,7 +898,7 @@ if ! "${skip_train}"; then if "${use_ngram}"; then log "Stage 9: Ngram Training: train_set=${data_feats}/lm_train.txt" cut -f 2- -d " " ${data_feats}/lm_train.txt | lmplz -S "20%" --discount_fallback -o ${ngram_num} - >${ngram_exp}/${ngram_num}gram.arpa - build_binary -s ${ngram_exp}/${ngram_num}gram.arpa ${ngram_exp}/${ngram_num}gram.bin + build_binary -s ${ngram_exp}/${ngram_num}gram.arpa ${ngram_exp}/${ngram_num}gram.bin else log "Stage 9: Skip ngram stages: use_ngram=${use_ngram}" fi @@ -1195,6 +1198,8 @@ if ! "${skip_eval}"; then else if "${use_streaming}"; then asr_inference_tool="espnet2.bin.asr_inference_streaming" + elif "${use_maskctc}"; then + asr_inference_tool="espnet2.bin.asr_inference_maskctc" else asr_inference_tool="espnet2.bin.asr_inference" fi diff --git a/egs2/TEMPLATE/asr1/db.sh b/egs2/TEMPLATE/asr1/db.sh index 3785aef57a8..31008b9502c 100755 --- a/egs2/TEMPLATE/asr1/db.sh +++ b/egs2/TEMPLATE/asr1/db.sh @@ -11,6 +11,7 @@ DIRHA_ENGLISH_PHDEV= DIRHA_WSJ= DIRHA_WSJ_PROCESSED="${PWD}/data/local/dirha_wsj_processed" # Output file path DNS= +DSING=downloads WSJ0= WSJ1= WSJCAM0= @@ -107,6 +108,7 @@ GOOGLEI18N=downloads NOISY_SPEECH= NOISY_REVERBERANT_SPEECH= LRS2= +LRS3= SUNDA=downloads CMU_ARCTIC=downloads CMU_INDIC=downloads @@ -126,6 +128,9 @@ PRIMEWORDS_CHINESE=downloads SEAME= BENGALI=downloads IWSLT14= +ST_CMDS=downloads +MS_INDIC_IS18= +MARATHI=downloads # For only CMU TIR environment if [[ "$(hostname)" == tir* ]]; then @@ -159,6 +164,8 @@ if [[ "$(hostname)" == tir* ]]; then IWSLT22_DIALECT=/projects/tir5/data/speech_corpora/LDC2022E01_IWSLT22_Tunisian_Arabic_Shared_Task_Training_Data/ PRIMEWORDS_CHINESE=/projects/tir5/data/speech_corpora/Primewords_Chinese FISHER_CALLHOME_SPANISH=/projects/tir5/data/speech_corpora/fisher_callhome_spanish + DSING=/projects/tir5/data/speech_corpora/sing_300x30x2 + MS_INDIC_IS18=/projects/tir6/general/cnariset/corpora/microsoft_speech_corpus_indian_languages fi # For only JHU environment diff --git a/egs2/TEMPLATE/asr1/pyscripts/utils/score_intent.py b/egs2/TEMPLATE/asr1/pyscripts/utils/score_intent.py index 13354637d52..4f0f074c9db 100755 --- a/egs2/TEMPLATE/asr1/pyscripts/utils/score_intent.py +++ b/egs2/TEMPLATE/asr1/pyscripts/utils/score_intent.py @@ -12,7 +12,7 @@ import argparse -def get_classification_result(hyp_file, ref_file): +def get_classification_result(hyp_file, ref_file, hyp_write, ref_write): hyp_lines = [line for line in hyp_file] ref_lines = [line for line in ref_file] @@ -22,6 +22,16 @@ def get_classification_result(hyp_file, ref_file): ref_intent = ref_lines[line_count].split(" ")[0] if hyp_intent != ref_intent: error += 1 + hyp_write.write( + " ".join(hyp_lines[line_count].split("\t")[0].split(" ")[1:]) + + "\t" + + hyp_lines[line_count].split("\t")[1] + ) + ref_write.write( + " ".join(ref_lines[line_count].split("\t")[0].split(" ")[1:]) + + "\t" + + ref_lines[line_count].split("\t")[1] + ) return 1 - (error / len(hyp_lines)) @@ -56,7 +66,16 @@ def get_classification_result(hyp_file, ref_file): os.path.join(exp_root, valid_inference_folder + "score_wer/ref.trn") ) -result = get_classification_result(valid_hyp_file,
valid_ref_file) +valid_hyp_write_file = open( + os.path.join(exp_root, valid_inference_folder + "score_wer/hyp_asr.trn"), "w" +) +valid_ref_write_file = open( + os.path.join(exp_root, valid_inference_folder + "score_wer/ref_asr.trn"), "w" +) + +result = get_classification_result( + valid_hyp_file, valid_ref_file, valid_hyp_write_file, valid_ref_write_file +) print("Valid Intent Classification Result") print(result) @@ -66,8 +85,16 @@ def get_classification_result(hyp_file, ref_file): test_ref_file = open( os.path.join(exp_root, test_inference_folder + "score_wer/ref.trn") ) +test_hyp_write_file = open( + os.path.join(exp_root, test_inference_folder + "score_wer/hyp_asr.trn"), "w" +) +test_ref_write_file = open( + os.path.join(exp_root, test_inference_folder + "score_wer/ref_asr.trn"), "w" +) -result = get_classification_result(test_hyp_file, test_ref_file) +result = get_classification_result( + test_hyp_file, test_ref_file, test_hyp_write_file, test_ref_write_file +) print("Test Intent Classification Result") print(result) @@ -79,6 +106,17 @@ def get_classification_result(hyp_file, ref_file): utt_test_ref_file = open( os.path.join(exp_root, utt_test_inference_folder + "score_wer/ref.trn") ) - result = get_classification_result(utt_test_hyp_file, utt_test_ref_file) + utt_test_hyp_write_file = open( + os.path.join(exp_root, utt_test_inference_folder + "score_wer/hyp_asr.trn"), "w" + ) + utt_test_ref_write_file = open( + os.path.join(exp_root, utt_test_inference_folder + "score_wer/ref_asr.trn"), "w" + ) + result = get_classification_result( + utt_test_hyp_file, + utt_test_ref_file, + utt_test_hyp_write_file, + utt_test_ref_write_file, + ) print("Unseen Utterance Test Intent Classification Result") print(result) diff --git a/egs2/TEMPLATE/asr1/pyscripts/utils/score_summarization.py b/egs2/TEMPLATE/asr1/pyscripts/utils/score_summarization.py new file mode 100644 index 00000000000..35202f1ce88 --- /dev/null +++ b/egs2/TEMPLATE/asr1/pyscripts/utils/score_summarization.py @@ -0,0 +1,50 @@ +import sys +import os +from datasets import load_metric +import numpy as np +from nlgeval import compute_metrics +from nlgeval import NLGEval + + +ref_file = sys.argv[1] +hyp_file = sys.argv[2] + +with open(ref_file, "r") as f: + ref_dict = { + line.strip().split(" ")[0]: " ".join(line.strip().split(" ")[1:]) + for line in f.readlines() + } + +with open(hyp_file, "r") as f: + hyp_dict = { + line.strip().split(" ")[0]: " ".join(line.strip().split(" ")[1:]) + for line in f.readlines() + } + +keys = [k for k, v in hyp_dict.items()] +labels = [ref_dict[k] for k, _ in hyp_dict.items()] +decoded_preds = [v for k, v in hyp_dict.items()] + +metric = load_metric("bertscore") +result_bert = metric.compute( + predictions=decoded_preds, + references=labels, + lang="en", +) + + +nlg = NLGEval() # loads the models +print("Key", "\t", "METEOR", "\t", "ROUGE-L") +for (key, ref, hyp) in zip(keys, labels, decoded_preds): + metrics_dict = nlg.compute_individual_metrics([ref], hyp) + print(key, "\t", metrics_dict["METEOR"], "\t", metrics_dict["ROUGE_L"]) +refs = [[x] for x in labels] +metrics_dict = nlg.compute_metrics(ref_list=[labels], hyp_list=decoded_preds) +metric = load_metric("rouge") +result = metric.compute(predictions=decoded_preds, references=labels) +result = {key: value.mid.fmeasure * 100 for key, value in result.items()} + +print( + f"RESULT {result['rouge1']} {result['rouge2']} {result['rougeL']} \ + {metrics_dict['METEOR']*100.0} {100*np.mean(result_bert['precision'])}" +) diff --git 
a/egs2/TEMPLATE/asr1/scripts/utils/show_asr_result.sh b/egs2/TEMPLATE/asr1/scripts/utils/show_asr_result.sh index afa768bf5d5..9b8abb9d658 100755 --- a/egs2/TEMPLATE/asr1/scripts/utils/show_asr_result.sh +++ b/egs2/TEMPLATE/asr1/scripts/utils/show_asr_result.sh @@ -44,7 +44,16 @@ cat << EOF EOF while IFS= read -r expdir; do - if ls "${expdir}"/*/*/score_*/result.txt &> /dev/null; then + + if ls "${expdir}"/*/*/result.sum &> /dev/null; then + echo "## $(basename ${expdir})" + cat << EOF +|dataset|ROUGE-1|ROUGE-2|ROUGE-L|METEOR|BERTScore| +|---|---|---|---|---|---| +EOF + grep -H -e "RESULT" "${expdir}"/*/*/result.sum | sed 's=RESULT==g' | cut -d ' ' -f 1,2- | tr ' ' '|' + echo + elif ls "${expdir}"/*/*/score_*/result.txt &> /dev/null; then echo "## $(basename ${expdir})" for type in wer cer ter; do cat << EOF diff --git a/egs2/TEMPLATE/mt1/mt.sh b/egs2/TEMPLATE/mt1/mt.sh index 6164c155558..35c6ab276c3 100755 --- a/egs2/TEMPLATE/mt1/mt.sh +++ b/egs2/TEMPLATE/mt1/mt.sh @@ -1165,37 +1165,54 @@ if ! "${skip_eval}"; then _scoredir="${_dir}/score_bleu" mkdir -p "${_scoredir}" - paste \ - <(<"${_data}/text.${tgt_case}.${tgt_lang}" \ - ${python} -m espnet2.bin.tokenize_text \ - -f 2- --input - --output - \ - --token_type word \ - --non_linguistic_symbols "${nlsyms_txt}" \ - --remove_non_linguistic_symbols true \ - --cleaner "${cleaner}" \ - ) \ - <(<"${_data}/text.${tgt_case}.${tgt_lang}" awk '{ print "(" $2 "-" $1 ")" }') \ - >"${_scoredir}/ref.trn.org" + <"${_data}/text.${tgt_case}.${tgt_lang}" \ + ${python} -m espnet2.bin.tokenize_text \ + -f 2- --input - --output - \ + --token_type word \ + --non_linguistic_symbols "${nlsyms_txt}" \ + --remove_non_linguistic_symbols true \ + --cleaner "${cleaner}" \ + >"${_scoredir}/ref.trn" + + #paste \ + # <(<"${_data}/text.${tgt_case}.${tgt_lang}" \ + # ${python} -m espnet2.bin.tokenize_text \ + # -f 2- --input - --output - \ + # --token_type word \ + # --non_linguistic_symbols "${nlsyms_txt}" \ + # --remove_non_linguistic_symbols true \ + # --cleaner "${cleaner}" \ + # ) \ + # <(<"${_data}/text.${tgt_case}.${tgt_lang}" awk '{ print "(" $2 "-" $1 ")" }') \ + # >"${_scoredir}/ref.trn.org" # NOTE(kamo): Don't use cleaner for hyp - paste \ - <(<"${_dir}/text" \ - ${python} -m espnet2.bin.tokenize_text \ - -f 2- --input - --output - \ - --token_type word \ - --non_linguistic_symbols "${nlsyms_txt}" \ - --remove_non_linguistic_symbols true \ - ) \ - <(<"${_data}/text.${tgt_case}.${tgt_lang}" awk '{ print "(" $2 "-" $1 ")" }') \ - >"${_scoredir}/hyp.trn.org" + <"${_dir}/text" \ + ${python} -m espnet2.bin.tokenize_text \ + -f 2- --input - --output - \ + --token_type word \ + --non_linguistic_symbols "${nlsyms_txt}" \ + --remove_non_linguistic_symbols true \ + >"${_scoredir}/hyp.trn" + + #paste \ + # <(<"${_dir}/text" \ + # ${python} -m espnet2.bin.tokenize_text \ + # -f 2- --input - --output - \ + # --token_type word \ + # --non_linguistic_symbols "${nlsyms_txt}" \ + # --remove_non_linguistic_symbols true \ + # ) \ + # <(<"${_data}/text.${tgt_case}.${tgt_lang}" awk '{ print "(" $2 "-" $1 ")" }') \ + # >"${_scoredir}/hyp.trn.org" # remove utterance id - perl -pe 's/\([^\)]+\)//g;' "${_scoredir}/ref.trn.org" > "${_scoredir}/ref.trn" - perl -pe 's/\([^\)]+\)//g;' "${_scoredir}/hyp.trn.org" > "${_scoredir}/hyp.trn" + #perl -pe 's/\([^\)]+\)//g;' "${_scoredir}/ref.trn.org" > "${_scoredir}/ref.trn" + #perl -pe 's/\([^\)]+\)//g;' "${_scoredir}/hyp.trn.org" > "${_scoredir}/hyp.trn" # detokenizer - detokenizer.perl -l en -q < "${_scoredir}/ref.trn" > 
"${_scoredir}/ref.trn.detok" - detokenizer.perl -l en -q < "${_scoredir}/hyp.trn" > "${_scoredir}/hyp.trn.detok" + detokenizer.perl -l ${tgt_lang} -q < "${_scoredir}/ref.trn" > "${_scoredir}/ref.trn.detok" + detokenizer.perl -l ${tgt_lang} -q < "${_scoredir}/hyp.trn" > "${_scoredir}/hyp.trn.detok" if [ ${tgt_case} = "tc" ]; then echo "Case sensitive BLEU result (single-reference)" >> ${_scoredir}/result.tc.txt @@ -1238,7 +1255,7 @@ if ! "${skip_eval}"; then # perl -pe 's/\([^\)]+\)//g;' "${_scoredir}/ref.trn.org.${ref_idx}" > "${_scoredir}/ref.trn.${ref_idx}" - detokenizer.perl -l en -q < "${_scoredir}/ref.trn.${ref_idx}" > "${_scoredir}/ref.trn.detok.${ref_idx}" + detokenizer.perl -l ${tgt_lang} -q < "${_scoredir}/ref.trn.${ref_idx}" > "${_scoredir}/ref.trn.detok.${ref_idx}" remove_punctuation.pl < "${_scoredir}/ref.trn.detok.${ref_idx}" > "${_scoredir}/ref.trn.detok.lc.rm.${ref_idx}" case_sensitive_refs="${case_sensitive_refs} ${_scoredir}/ref.trn.detok.${ref_idx}" case_insensitive_refs="${case_insensitive_refs} ${_scoredir}/ref.trn.detok.lc.rm.${ref_idx}" diff --git a/egs2/TEMPLATE/ssl1/pyscripts/dump_km_label.py b/egs2/TEMPLATE/ssl1/pyscripts/dump_km_label.py index 6b14ac4ec97..552c84f89ad 100644 --- a/egs2/TEMPLATE/ssl1/pyscripts/dump_km_label.py +++ b/egs2/TEMPLATE/ssl1/pyscripts/dump_km_label.py @@ -39,13 +39,13 @@ class ApplyKmeans(object): def __init__(self, km_path): self.km_model = joblib.load(km_path) self.nc = self.km_model.cluster_centers_.transpose() - self.nc_norm = (self.nc ** 2).sum(0, keepdims=True) + self.nc_norm = (self.nc**2).sum(0, keepdims=True) def __call__(self, x): if isinstance(x, torch.Tensor): x = x.cpu().numpy() probs = ( - (x ** 2).sum(1, keepdims=True) - 2 * np.matmul(x, self.nc) + self.nc_norm + (x**2).sum(1, keepdims=True) - 2 * np.matmul(x, self.nc) + self.nc_norm ) return np.argmin(probs, axis=1) diff --git a/egs2/TEMPLATE/st1/st.sh b/egs2/TEMPLATE/st1/st.sh index 93ffe4d3cf5..9867f341f88 100755 --- a/egs2/TEMPLATE/st1/st.sh +++ b/egs2/TEMPLATE/st1/st.sh @@ -296,18 +296,8 @@ fi # Extra files for translation process utt_extra_files="text.${src_case}.${src_lang} text.${tgt_case}.${tgt_lang}" # Use the same text as ST for bpe training if not specified. -if "${token_joint}"; then - # if token_joint, the bpe training will use both src_lang and tgt_lang to train a single bpe model - [ -z "${src_bpe_train_text}" ] && src_bpe_train_text="${data_feats}/${train_set}/text.${src_case}.${src_lang}" - [ -z "${tgt_bpe_train_text}" ] && tgt_bpe_train_text="${data_feats}/${train_set}/text.${tgt_case}.${tgt_lang}" - - # Prepare data as text.${src_lang}_${tgt_lang}) - cat $src_bpe_train_text $tgt_bpe_train_text > ${data_feats}/${train_set}/text.${src_lang}_${tgt_lang} - tgt_bpe_train_text="${data_feats}/${train_set}/text.${src_lang}_${tgt_lang}" -else - [ -z "${src_bpe_train_text}" ] && src_bpe_train_text="${data_feats}/${train_set}/text.${src_case}.${src_lang}" - [ -z "${tgt_bpe_train_text}" ] && tgt_bpe_train_text="${data_feats}/${train_set}/text.${tgt_case}.${tgt_lang}" -fi +[ -z "${src_bpe_train_text}" ] && src_bpe_train_text="${data_feats}/${train_set}/text.${src_case}.${src_lang}" +[ -z "${tgt_bpe_train_text}" ] && tgt_bpe_train_text="${data_feats}/${train_set}/text.${tgt_case}.${tgt_lang}" # Use the same text as ST for lm training if not specified. [ -z "${lm_train_text}" ] && lm_train_text="${data_feats}/${train_set}/text.${tgt_case}.${tgt_lang}" # Use the same text as ST for lm training if not specified. @@ -743,6 +733,16 @@ if ! 
"${skip_data_prep}"; then fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then + # Combine source and target texts when using joint tokenization + if "${token_joint}"; then + log "Merge src and target data if joint BPE" + + cat $tgt_bpe_train_text > ${data_feats}/${train_set}/text.${src_lang}_${tgt_lang} + [ ! -z "${src_bpe_train_text}" ] && cat ${src_bpe_train_text} >> ${data_feats}/${train_set}/text.${src_lang}_${tgt_lang} + # Set the new text as the target text + tgt_bpe_train_text="${data_feats}/${train_set}/text.${src_lang}_${tgt_lang}" + fi + # First generate tgt lang if [ "${tgt_token_type}" = bpe ]; then log "Stage 5a: Generate token_list from ${tgt_bpe_train_text} using BPE for tgt_lang" @@ -1484,8 +1484,8 @@ if ! "${skip_eval}"; then perl -pe 's/\([^\)]+\)//g;' "${_scoredir}/hyp.trn.org" > "${_scoredir}/hyp.trn" # detokenizer - detokenizer.perl -l en -q < "${_scoredir}/ref.trn" > "${_scoredir}/ref.trn.detok" - detokenizer.perl -l en -q < "${_scoredir}/hyp.trn" > "${_scoredir}/hyp.trn.detok" + detokenizer.perl -l ${tgt_lang} -q < "${_scoredir}/ref.trn" > "${_scoredir}/ref.trn.detok" + detokenizer.perl -l ${tgt_lang} -q < "${_scoredir}/hyp.trn" > "${_scoredir}/hyp.trn.detok" if [ ${tgt_case} = "tc" ]; then echo "Case sensitive BLEU result (single-reference)" >> ${_scoredir}/result.tc.txt @@ -1528,7 +1528,7 @@ if ! "${skip_eval}"; then # perl -pe 's/\([^\)]+\)//g;' "${_scoredir}/ref.trn.org.${ref_idx}" > "${_scoredir}/ref.trn.${ref_idx}" - detokenizer.perl -l en -q < "${_scoredir}/ref.trn.${ref_idx}" > "${_scoredir}/ref.trn.detok.${ref_idx}" + detokenizer.perl -l ${tgt_lang} -q < "${_scoredir}/ref.trn.${ref_idx}" > "${_scoredir}/ref.trn.detok.${ref_idx}" remove_punctuation.pl < "${_scoredir}/ref.trn.detok.${ref_idx}" > "${_scoredir}/ref.trn.detok.lc.rm.${ref_idx}" case_sensitive_refs="${case_sensitive_refs} ${_scoredir}/ref.trn.detok.${ref_idx}" case_insensitive_refs="${case_insensitive_refs} ${_scoredir}/ref.trn.detok.lc.rm.${ref_idx}" @@ -1551,7 +1551,7 @@ if ! 
"${skip_eval}"; then done # Show results in Markdown syntax - scripts/utils/show_st_result.sh --case $tgt_case "${st_exp}" > "${st_exp}"/RESULTS.md + scripts/utils/show_translation_result.sh --case $tgt_case "${st_exp}" > "${st_exp}"/RESULTS.md cat "${cat_exp}"/RESULTS.md fi else diff --git a/egs2/bn_openslr53/asr1/README.md b/egs2/bn_openslr53/asr1/README.md new file mode 100644 index 00000000000..542c8053339 --- /dev/null +++ b/egs2/bn_openslr53/asr1/README.md @@ -0,0 +1,29 @@ +# RESULTS +## Environments +- date: `Mon Jan 31 10:53:20 EST 2022` +- python version: `3.9.5 (default, Jun 4 2021, 12:28:51) [GCC 7.5.0]` +- espnet version: `espnet 0.10.6a1` +- pytorch version: `pytorch 1.8.1+cu102` +- Git hash: `9d09bf551a9fe090973de60e15adec1de6b3d054` + - Commit date: `Fri Jan 21 11:43:15 2022 -0500` +- Pretrained Model: https://huggingface.co/espnet/bn_openslr53 + +## asr_train_asr_raw_bpe1000 +### WER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_asr_batch_size1_lm_lm_train_lm_bpe1000_valid.loss.ave_asr_model_valid.acc.best/sbn_test|2018|6470|74.2|21.3|4.5|2.2|28.0|48.8| + +### CER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_asr_batch_size1_lm_lm_train_lm_bpe1000_valid.loss.ave_asr_model_valid.acc.best/sbn_test|2018|39196|89.4|4.3|6.3|1.4|12.0|48.8| + +### TER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_asr_batch_size1_lm_lm_train_lm_bpe1000_valid.loss.ave_asr_model_valid.acc.best/sbn_test|2018|15595|77.6|12.7|9.7|1.6|24.0|48.7| + diff --git a/egs2/chime4/enh1/README.md b/egs2/chime4/enh1/README.md index 9ca905d08cd..886eb0cbf26 100644 --- a/egs2/chime4/enh1/README.md +++ b/egs2/chime4/enh1/README.md @@ -6,6 +6,7 @@ - python version: `3.6.3 |Anaconda, Inc.| (default, Nov 20 2017, 20:41:42) [GCC 7.2.0]` - espnet version: `espnet 0.9.7` - pytorch version: `pytorch 1.6.0` +- Note: PESQ is evaluated based on https://github.com/vBaiCai/python-pesq ## enh_train_enh_conv_tasnet_raw @@ -25,3 +26,36 @@ config: conf/tuning/train_enh_beamformer_mvdr.yaml |---|---|---|---|---|---|---| |enhanced_dt05_simu_isolated_6ch_track|2.60|0.94|13.67|13.67|0|12.51| |enhanced_et05_simu_isolated_6ch_track|2.63|0.95|15.51|15.51|0|14.65| + + +## enh_train_enh_dc_crn_mapping_snr_raw + +config: conf/tuning/train_enh_dc_crn_mapping_snr.yaml + +|dataset|PESQ|STOI|SAR|SDR|SIR|SI_SNR| +|---|---|---|---|---|---|---| +|enhanced_dt05_simu_isolated_6ch_track|3.10|0.96|17.82|17.82|0.00|17.59| +|enhanced_et05_simu_isolated_6ch_track|2.95|0.95|17.33|17.33|0.00|17.04| + + +# RESULTS +## Environments +- date: `Sat Mar 19 07:17:45 CST 2022` +- python version: `3.7.11 (default, Jul 27 2021, 14:32:16) [GCC 7.5.0]` +- espnet version: `espnet 0.10.7a1` +- pytorch version: `pytorch 1.8.1` +- Git hash: `648b024d8fb262eb9923c06a698b9c6df5b16e51` + - Commit date: `Wed Mar 16 18:47:21 2022 +0800` + + +## enh_train_enh_dprnntac_fasnet_raw + +config: conf/tuning/train_enh_dprnntac_fasnet.yaml + +Pretrained model: https://huggingface.co/lichenda/chime4_fasnet_dprnn_tac + +|dataset|STOI|SAR|SDR|SIR| +|---|---|---|---|---| +|enhanced_dt05_simu_isolated_6ch_track|0.95|15.75|15.75|0.00| +|enhanced_et05_simu_isolated_6ch_track|0.94|15.40|15.40|0.00| + diff --git a/egs2/chime4/enh1/conf/tuning/train_enh_beamformer_mvdr.yaml b/egs2/chime4/enh1/conf/tuning/train_enh_beamformer_mvdr.yaml index fc996552cd3..cee051c8ef1 100644 --- a/egs2/chime4/enh1/conf/tuning/train_enh_beamformer_mvdr.yaml +++ 
b/egs2/chime4/enh1/conf/tuning/train_enh_beamformer_mvdr.yaml @@ -53,7 +53,7 @@ separator_conf: bunits: 512 bprojs: 512 badim: 320 - ref_channel: 4 + ref_channel: 3 use_noise_mask: True beamformer_type: mvdr_souden bdropout_rate: 0.0 diff --git a/egs2/chime4/enh1/conf/tuning/train_enh_dc_crn_mapping_snr.yaml b/egs2/chime4/enh1/conf/tuning/train_enh_dc_crn_mapping_snr.yaml new file mode 100644 index 00000000000..38d61843282 --- /dev/null +++ b/egs2/chime4/enh1/conf/tuning/train_enh_dc_crn_mapping_snr.yaml @@ -0,0 +1,67 @@ +init: xavier_uniform +max_epoch: 200 +batch_type: folded +batch_size: 16 +iterator_type: chunk +chunk_length: 32000 +num_workers: 4 +optim: adam +optim_conf: + lr: 1.0e-03 + eps: 1.0e-08 + weight_decay: 1.0e-7 + amsgrad: true +patience: 10 +grad_clip: 5 +val_scheduler_criterion: +- valid +- loss +best_model_criterion: +- - valid + - si_snr + - max +- - valid + - loss + - min +keep_nbest_models: 1 +scheduler: steplr +scheduler_conf: + step_size: 2 + gamma: 0.98 + +# A list for criterions +# The overlall loss in the multi-task learning will be: +# loss = weight_1 * loss_1 + ... + weight_N * loss_N +# The default `weight` for each sub-loss is 1.0 +criterions: + # The first criterion + - name: snr + conf: + eps: 1.0e-7 + # the wrapper for the current criterion + # PIT is widely used in the speech separation task + wrapper: pit + wrapper_conf: + weight: 1.0 + + +encoder: stft +encoder_conf: + n_fft: 256 + hop_length: 128 +decoder: stft +decoder_conf: + n_fft: 256 + hop_length: 128 +separator: dc_crn +separator_conf: + num_spk: 1 + input_channels: [10, 16, 32, 64, 128, 256] # 5x2=10 input channels + enc_hid_channels: 8 + enc_layers: 5 + glstm_groups: 2 + glstm_layers: 2 + glstm_bidirectional: true + glstm_rearrange: false + mode: mapping + ref_channel: 3 diff --git a/egs2/chime4/enh1/conf/tuning/train_enh_dprnntac_fasnet.yaml b/egs2/chime4/enh1/conf/tuning/train_enh_dprnntac_fasnet.yaml new file mode 100644 index 00000000000..b5dd47ddac7 --- /dev/null +++ b/egs2/chime4/enh1/conf/tuning/train_enh_dprnntac_fasnet.yaml @@ -0,0 +1,59 @@ +optim: adam +init: xavier_uniform +max_epoch: 100 +batch_type: folded +batch_size: 8 +iterator_type: chunk +chunk_length: 32000 +num_workers: 4 +optim_conf: + lr: 1.0e-03 + eps: 1.0e-08 + weight_decay: 0 +patience: 10 +val_scheduler_criterion: +- valid +- loss +best_model_criterion: +- - valid + - si_snr + - max +- - valid + - loss + - min +keep_nbest_models: 1 +scheduler: steplr +scheduler_conf: + step_size: 2 + gamma: 0.98 + +encoder: same +encoder_conf: {} +decoder: same +decoder_conf: {} +separator: fasnet +separator_conf: + enc_dim: 64 + feature_dim: 64 + hidden_dim: 128 + layer: 6 + segment_size: 24 + num_spk: 1 + win_len: 16 + context_len: 16 + sr: 16000 + fasnet_type: 'fasnet' + dropout: 0.2 + + + +criterions: + # The first criterion + - name: si_snr + conf: + eps: 1.0e-7 + # the wrapper for the current criterion + # for single-talker case, we simplely use fixed_order wrapper + wrapper: fixed_order + wrapper_conf: + weight: 1.0 diff --git a/egs2/chime4/enh1/conf/tuning/train_enh_dprnntac_ifasnet.yaml b/egs2/chime4/enh1/conf/tuning/train_enh_dprnntac_ifasnet.yaml new file mode 100644 index 00000000000..ef1349ad8b9 --- /dev/null +++ b/egs2/chime4/enh1/conf/tuning/train_enh_dprnntac_ifasnet.yaml @@ -0,0 +1,58 @@ +optim: adam +init: xavier_uniform +max_epoch: 100 +batch_type: folded +batch_size: 8 +iterator_type: chunk +chunk_length: 32000 +num_workers: 4 +optim_conf: + lr: 1.0e-03 + eps: 1.0e-08 + weight_decay: 0 +patience: 10 
+val_scheduler_criterion: +- valid +- loss +best_model_criterion: +- - valid + - si_snr + - max +- - valid + - loss + - min +keep_nbest_models: 1 +scheduler: steplr +scheduler_conf: + step_size: 2 + gamma: 0.98 + +encoder: same +encoder_conf: {} +decoder: same +decoder_conf: {} +separator: fasnet +separator_conf: + enc_dim: 64 + feature_dim: 64 + hidden_dim: 128 + layer: 6 + segment_size: 24 + num_spk: 1 + win_len: 16 + context_len: 16 + sr: 16000 + fasnet_type: 'ifasnet' + + + +criterions: + # The first criterion + - name: si_snr + conf: + eps: 1.0e-7 + # the wrapper for the current criterion + # for single-talker case, we simplely use fixed_order wrapper + wrapper: fixed_order + wrapper_conf: + weight: 1.0 diff --git a/egs2/chime4/enh1/local/simu_ext_chime4_data_prep.sh b/egs2/chime4/enh1/local/simu_ext_chime4_data_prep.sh index 08df7d0dc4c..5cd50773aeb 100755 --- a/egs2/chime4/enh1/local/simu_ext_chime4_data_prep.sh +++ b/egs2/chime4/enh1/local/simu_ext_chime4_data_prep.sh @@ -85,6 +85,8 @@ elif [[ "$track" == "6" ]]; then done for x in $list_set; do + # drop the second channel to follow the convention in CHiME-4 + # see P27 in https://hal.inria.fr/hal-01399180/file/vincent_CSL16.pdf mix-mono-wav-scp.py ${x}_wav.CH{1,3,4,5,6}.scp > ${x}_wav.scp mix-mono-wav-scp.py ${x}_spk1_wav.CH{1,3,4,5,6}.scp > ${x}_spk1_wav.scp sed -E "s#\.Clean\.wav#\.Noise\.wav#g" ${x}_spk1_wav.scp > ${x}_noise_wav.scp diff --git a/egs2/chime4/enh1/run.sh b/egs2/chime4/enh1/run.sh index cf95ee85954..60ee54ec435 100755 --- a/egs2/chime4/enh1/run.sh +++ b/egs2/chime4/enh1/run.sh @@ -25,7 +25,7 @@ test_sets="et05_simu_isolated_1ch_track" --fs ${sample_rate} \ --ngpu 2 \ --spk_num 1 \ - --ref_channel 4 \ + --ref_channel 3 \ --local_data_opts "--extra-annotations ${extra_annotations} --stage 1 --stop-stage 2" \ --enh_config conf/tuning/train_enh_conv_tasnet.yaml \ --use_dereverb_ref false \ diff --git a/egs2/dsing/asr1/RESULTS.md b/egs2/dsing/asr1/RESULTS.md new file mode 100644 index 00000000000..0cdd661e049 --- /dev/null +++ b/egs2/dsing/asr1/RESULTS.md @@ -0,0 +1,55 @@ + +# RESULTS +## Environments +- date: `Sat Mar 19 23:02:37 EDT 2022` +- python version: `3.9.7 (default, Sep 16 2021, 13:09:58) [GCC 7.5.0]` +- espnet version: `espnet 0.10.7a1` +- pytorch version: `pytorch 1.10.1` +- Git hash: `c1ed71c6899e54c0b3dad82687886b1183cd0885` + - Commit date: `Wed Mar 16 23:34:49 2022 -0400` + +## asr_train_asr_conformer7_hubert_ll60k_large_raw_bpe500_sp +- model: https://huggingface.co/espnet/ftshijt_espnet2_asr_dsing_hubert_conformer +### WER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_asr_lm_lm_train_lm_bpe500_valid.loss.ave_asr_model_latest/dev|482|4018|83.6|9.4|7.0|6.4|22.8|58.3| +|decode_asr_lm_lm_train_lm_bpe500_valid.loss.ave_asr_model_latest/test|480|4632|81.4|12.3|6.3|4.5|23.1|52.1| + +### CER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_asr_lm_lm_train_lm_bpe500_valid.loss.ave_asr_model_latest/dev|482|18692|88.5|3.1|8.4|5.9|17.4|58.3| +|decode_asr_lm_lm_train_lm_bpe500_valid.loss.ave_asr_model_latest/test|480|21787|87.9|4.3|7.8|4.5|16.6|52.1| + +### TER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_asr_lm_lm_train_lm_bpe500_valid.loss.ave_asr_model_latest/dev|482|6097|82.2|7.1|10.7|5.7|23.5|58.3| +|decode_asr_lm_lm_train_lm_bpe500_valid.loss.ave_asr_model_latest/test|480|7736|81.7|9.2|9.1|4.0|22.3|52.1| + +## asr_train_asr_raw_bpe500_sp +- model: 
https://huggingface.co/espnet/ftshijt_espnet2_asr_dsing_transformer +### WER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_asr_lm_lm_train_lm_bpe500_valid.loss.ave_asr_model_valid.acc.ave/dev|482|4018|77.0|16.2|6.8|4.0|27.0|65.1| +|decode_asr_lm_lm_train_lm_bpe500_valid.loss.ave_asr_model_valid.acc.ave/test|480|4632|76.1|17.3|6.6|3.7|27.6|57.7| + +### CER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_asr_lm_lm_train_lm_bpe500_valid.loss.ave_asr_model_valid.acc.ave/dev|482|18692|85.0|5.8|9.2|4.2|19.2|65.1| +|decode_asr_lm_lm_train_lm_bpe500_valid.loss.ave_asr_model_valid.acc.ave/test|480|21787|84.9|6.3|8.8|4.2|19.3|57.7| + +### TER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_asr_lm_lm_train_lm_bpe500_valid.loss.ave_asr_model_valid.acc.ave/dev|482|6097|75.2|12.8|12.0|4.1|28.9|65.1| +|decode_asr_lm_lm_train_lm_bpe500_valid.loss.ave_asr_model_valid.acc.ave/test|480|7736|75.3|14.3|10.4|4.1|28.8|57.7| \ No newline at end of file diff --git a/egs2/dsing/asr1/asr.sh b/egs2/dsing/asr1/asr.sh new file mode 120000 index 00000000000..60b05122cfd --- /dev/null +++ b/egs2/dsing/asr1/asr.sh @@ -0,0 +1 @@ +../../TEMPLATE/asr1/asr.sh \ No newline at end of file diff --git a/egs2/dsing/asr1/cmd.sh b/egs2/dsing/asr1/cmd.sh new file mode 100644 index 00000000000..2aae6919fef --- /dev/null +++ b/egs2/dsing/asr1/cmd.sh @@ -0,0 +1,110 @@ +# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ====== +# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...> +# e.g. +# run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB +# +# Options: +# --time