diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index ac69ca49b32..bbd021e7afd 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -16,7 +16,7 @@ jobs: matrix: os: [ubuntu-18.04] python-version: [3.7] - pytorch-version: [1.3.1, 1.4.0, 1.5.1, 1.6.0, 1.7.1, 1.8.1, 1.9.1, 1.10.2, 1.11.0] + pytorch-version: [1.4.0, 1.5.1, 1.6.0, 1.7.1, 1.8.1, 1.9.1, 1.10.2, 1.11.0] chainer-version: [6.0.0] # NOTE(kamo): Conda is tested by Circle-CI use-conda: [false] diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml new file mode 100644 index 00000000000..049bdabafa1 --- /dev/null +++ b/.github/workflows/docker.yml @@ -0,0 +1,48 @@ +name: docker-builder + +on: + pull_request: + types: [closed] + branches: + - master + paths: + - 'tools/**' + - setup.py + +jobs: + docker: + runs-on: ubuntu-latest + if: github.event.pull_request.merged == true + steps: + - uses: actions/checkout@v2 + + - name: Set up QEMU + uses: docker/setup-qemu-action@v1 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v1 + + - name: Login to DockerHub + uses: docker/login-action@v1 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Build and push CPU container + run: | + cd docker + docker build --build-arg FROM_TAG=runtime-latest \ + -f prebuilt/devel.dockerfile \ + --target devel \ + -t espnet/espnet:cpu-latest . + docker push espnet/espnet:cpu-latest + + - name: Build and push GPU container + run: | + cd docker + docker build --build-arg FROM_TAG=cuda-latest \ + --build-arg CUDA_VER=11.1 \ + -f prebuilt/devel.dockerfile \ + --target devel \ + -t espnet/espnet:gpu-latest . 
+ docker push espnet/espnet:gpu-latest diff --git a/.gitignore b/.gitignore index 177ba14498d..7170a376705 100644 --- a/.gitignore +++ b/.gitignore @@ -42,6 +42,8 @@ egs*/*/*/tensorboard egs*/*/*/wav* egs*/*/*/nltk* +egs*/*/*/pretrained_models* +egs*/fisher_callhome_spanish/*/local/mapping* # tools related tools/chainer diff --git a/.gitmodules b/.gitmodules index bc771d8c6ee..e69de29bb2d 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +0,0 @@ -[submodule "doc/notebook"] - path = doc/notebook - url = https://github.com/espnet/notebook diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 3e62434c769..9036a09b66d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -53,9 +53,9 @@ ESPnet2's recipes correspond to `egs2`. ESPnet2 applies a new paradigm without d For ESPnet2, we do not recommend preparing the recipe's stages for each corpus but using the common pipelines we provided in `asr.sh`, `tts.sh`, and `enh.sh`. For details of creating ESPnet2 recipes, please refer to [egs2-readme](https://github.com/espnet/espnet/blob/master/egs2/TEMPLATE/README.md). -The common pipeline of ESPnet2 recipes will take care of the `RESULTS.md` generation, model packing, and uploading. ESPnet2 models are maintained at Zenodo and Hugging Face. +The common pipeline of ESPnet2 recipes will take care of the `RESULTS.md` generation, model packing, and uploading. ESPnet2 models are maintained at Hugging Face and Zenodo (Deprecated). You can also refer to the document in https://github.com/espnet/espnet_model_zoo -To upload your model, you need first: +To upload your model, you need first (This is currently deprecated; uploading to the Hugging Face Hub is preferred): 1. Sign up to Zenodo: https://zenodo.org/ 2. Create access token: https://zenodo.org/account/settings/applications/tokens/new/ 3. Set your environment: % export ACCESS_TOKEN="" @@ -64,6 +64,21 @@ To port models from zenodo using Hugging Face hub, 1. Create a Hugging Face account - https://huggingface.co/ 2.
Request to be added to espnet organisation - https://huggingface.co/espnet 3. Go to `egs2/RECIPE/*/scripts/utils` and run `./upload_models_to_hub.sh "ZENODO_MODEL_NAME"` + +To upload models using huggingface-cli, follow these steps: +You can also refer to https://huggingface.co/docs/transformers/model_sharing +1. Create a Hugging Face account - https://huggingface.co/ +2. Request to be added to espnet organisation - https://huggingface.co/espnet +3. Run huggingface-cli login (You can get the token required at this step under Settings > Access Tokens > espnet token) +4. `huggingface-cli repo create your-model-name --organization espnet` +5. `git clone https://huggingface.co/username/your-model-name` (clone this outside ESPnet to avoid issues, as this is a git repo) +6. `cd your-model-name` +7. `git lfs install` +8. copy contents from the exp directory of your recipe into this directory (Check other models of similar task under ESPnet to confirm your directory structure) +9. `git add .` +10. `git commit -m "Add model files"` +11. `git push` +12. Check if the inference demo on HF is running successfully to verify the upload #### 1.3.3 Additional requirements for new recipe @@ -76,6 +91,18 @@ to its differences. - If a recipe for a new corpus is proposed, you should add its name and information to: https://github.com/espnet/espnet/blob/master/egs/README.md if it's a ESPnet1 recipe, or https://github.com/espnet/espnet/blob/master/egs2/README.md + `db.sh` if it's a ESPnet2 recipe. + +#### 1.3.4 Checklist before you submit the recipe-based PR + +- [ ] be careful about the name for the recipe. It is recommended to follow naming conventions of the other recipes +- [ ] common/shared files are linked with **soft link** (see Section 1.3.3) +- [ ] modified or new python scripts should be passed through **latest** black formatting (by using python package black).
The command to be executed could be `black espnet espnet2 test utils setup.py egs*/*/*/local egs2/TEMPLATE/asr1/pyscripts` +- [ ] cluster settings should be set as **default** (e.g., cmd.sh conf/slurm.conf conf/queue.conf conf/pbs.conf) +- [ ] update `egs/README.md` or `egs2/README.md` with corresponding recipes +- [ ] add corresponding entry in `egs2/TEMPLATE/db.sh` for a new corpus +- [ ] try to **simplify** the model configurations. We recommend to have only the best configuration for the start of a recipe. Please also follow the default rule defined in Section 1.3.3 +- [ ] large meta-information for a corpus should be maintained elsewhere other than in the recipe itself +- [ ] recommend to also include results and pre-trained model with the recipe ## 2 Pull Request If your proposed feature or bugfix is ready, please open a Pull Request (PR) at https://github.com/espnet/espnet @@ -124,6 +151,11 @@ we recommend using small model parameters and avoiding dynamic imports, file acc more running time, you can annotate your test with `@pytest.mark.execution_timeout(sec)`. - For test initialization (parameters, modules, etc), you can use pytest fixtures. Refer to [pytest fixtures](https://docs.pytest.org/en/latest/fixture.html#using-fixtures-from-classes-modules-or-projects) for more information. +In addition, please follow the [PEP 8 convention](https://peps.python.org/pep-0008/) for the coding style and [Google's convention for docstrings](https://google.github.io/styleguide/pyguide.html#383-functions-and-methods). +Below are some specific points that should be taken care of in particular: +- [import ordering](https://peps.python.org/pep-0008/#imports) +- Avoid writing python2-style code. For example, `super().__init__()` is preferred over `super(CLASS_NAME, self).__init__()`.
+ ### 4.2 Bash scripts diff --git a/README.md b/README.md index 0493ec5b56e..358da305ccc 100644 --- a/README.md +++ b/README.md @@ -2,15 +2,16 @@ # ESPnet: end-to-end speech processing toolkit -|system/pytorch ver.|1.3.1|1.4.0|1.5.1|1.6.0|1.7.1|1.8.1|1.9.1|1.10.2|1.11.0| -| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | -|ubuntu20/python3.10/pip|||||||||[![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions)| -|ubuntu20/python3.9/pip|||||||||[![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions)| -|ubuntu20/python3.8/pip|||||||||[![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions)| +|system/pytorch ver.|1.4.0|1.5.1|1.6.0|1.7.1|1.8.1|1.9.1|1.10.2|1.11.0| +| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | +|ubuntu20/python3.10/pip||||||||[![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions)| +|ubuntu20/python3.9/pip||||||||[![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions)| +|ubuntu20/python3.8/pip||||||||[![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions)| |ubuntu18/python3.7/pip|[![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions)|[![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions)|[![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions)|[![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions)|[![Github 
Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions)|[![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions)|[![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions)|[![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions)| -|debian9/python3.7/conda|||||||||[![debian9](https://github.com/espnet/espnet/workflows/debian9/badge.svg)](https://github.com/espnet/espnet/actions?query=workflow%3Adebian9)| -|centos7/python3.7/conda|||||||||[![centos7](https://github.com/espnet/espnet/workflows/centos7/badge.svg)](https://github.com/espnet/espnet/actions?query=workflow%3Acentos7)| -|doc/python3.8|||||||||[![doc](https://github.com/espnet/espnet/workflows/doc/badge.svg)](https://github.com/espnet/espnet/actions?query=workflow%3Adoc)| +|debian9/python3.7/conda||||||||[![debian9](https://github.com/espnet/espnet/workflows/debian9/badge.svg)](https://github.com/espnet/espnet/actions?query=workflow%3Adebian9)| +|centos7/python3.7/conda||||||||[![centos7](https://github.com/espnet/espnet/workflows/centos7/badge.svg)](https://github.com/espnet/espnet/actions?query=workflow%3Acentos7)| +|doc/python3.8||||||||[![doc](https://github.com/espnet/espnet/workflows/doc/badge.svg)](https://github.com/espnet/espnet/actions?query=workflow%3Adoc)| + [![PyPI version](https://badge.fury.io/py/espnet.svg)](https://badge.fury.io/py/espnet) [![Python Versions](https://img.shields.io/pypi/pyversions/espnet.svg)](https://pypi.org/project/espnet/) @@ -36,7 +37,7 @@ ESPnet uses [pytorch](http://pytorch.org/) as a deep learning engine and also fo - Support numbers of `ASR` recipes (WSJ, Switchboard, CHiME-4/5, Librispeech, TED, CSJ, AMI, HKUST, Voxforge, REVERB, etc.) 
- Support numbers of `TTS` recipes with a similar manner to the ASR recipe (LJSpeech, LibriTTS, M-AILABS, etc.) - Support numbers of `ST` recipes (Fisher-CallHome Spanish, Libri-trans, IWSLT'18, How2, Must-C, Mboshi-French, etc.) -- Support numbers of `MT` recipes (IWSLT'16, the above ST recipes etc.) +- Support numbers of `MT` recipes (IWSLT'14, IWSLT'16, the above ST recipes etc.) - Support numbers of `SLU` recipes (CATSLU-MAPS, FSC, Grabo, IEMOCAP, JDCINAL, SNIPS, SLURP, SWBD-DA, etc.) - Support numbers of `SE/SS` recipes (DNS-IS2020, LibriMix, SMS-WSJ, VCTK-noisyreverb, WHAM!, WHAMR!, WSJ-2mix, etc.) - Support voice conversion recipe (VCC2020 baseline) @@ -78,7 +79,11 @@ ESPnet uses [pytorch](http://pytorch.org/) as a deep learning engine and also fo - Self-supervised learning representations as features, using upstream models in [S3PRL](https://github.com/s3prl/s3prl) in frontend. - Set `frontend` to be `s3prl` - Select any upstream model by setting the `frontend_conf` to the corresponding name. +- Transfer Learning : + - easy usage and transfers from models previously trained by your group, or models from [ESPnet huggingface repository](https://huggingface.co/espnet). + - [Documentation](https://github.com/espnet/espnet/tree/master/egs2/mini_an4/asr1/transfer_learning.md) and [toy example runnable on colab](https://github.com/espnet/notebook/blob/master/espnet2_asr_transfer_learning_demo.ipynb). - Streaming Transformer/Conformer ASR with blockwise synchronous beam search. 
+- Restricted Self-Attention based on [Longformer](https://arxiv.org/abs/2004.05150) as an encoder for long sequences Demonstration - Real-time ASR demo with ESPnet2 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/espnet/notebook/blob/master/espnet2_asr_realtime_demo.ipynb) @@ -94,7 +99,7 @@ Demonstration - Conformer FastSpeech & FastSpeech2 - VITS - Multi-speaker & multi-language extention - - Pretrined speaker embedding (e.g., X-vector) + - Pretrained speaker embedding (e.g., X-vector) - Speaker ID embedding - Language ID embedding - Global style token (GST) embedding @@ -130,7 +135,7 @@ To train the neural vocoder, please check the following repositories: - Multi-speaker speech separation - Unified encoder-separator-decoder structure for time-domain and frequency-domain models - Encoder/Decoder: STFT/iSTFT, Convolution/Transposed-Convolution - - Separators: BLSTM, Transformer, Conformer, DPRNN, Neural Beamformers, etc. + - Separators: BLSTM, Transformer, Conformer, [TasNet](https://arxiv.org/abs/1809.07454), [DPRNN](https://arxiv.org/abs/1910.06379), [SkiM](https://arxiv.org/abs/2201.10800), [SVoice](https://arxiv.org/abs/2011.02329), [DC-CRN](https://web.cse.ohio-state.edu/~wang.77/papers/TZW.taslp21.pdf), [DCCRN](https://arxiv.org/abs/2008.00264), [Deep Clustering](https://ieeexplore.ieee.org/document/7471631), [Deep Attractor Network](https://pubmed.ncbi.nlm.nih.gov/29430212/), [FaSNet](https://arxiv.org/abs/1909.13387), [iFaSNet](https://arxiv.org/abs/1910.14104), Neural Beamformers, etc. - Flexible ASR integration: working as an individual task or as the ASR frontend - Easy to import pretrained models from [Asteroid](https://github.com/asteroid-team/asteroid) - Both the pre-trained models from Asteroid and the specific configuration are supported. 
@@ -138,7 +143,6 @@ To train the neural vocoder, please check the following repositories: Demonstration - Interactive SE demo with ESPnet2 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1fjRJCh96SoYLZPRxsjF9VDv4Q2VoIckI?usp=sharing) - ### ST: Speech Translation & MT: Machine Translation - **State-of-the-art performance** in several ST benchmarks (comparable/superior to cascaded ASR and MT) - Transformer based end-to-end ST (new!) @@ -149,9 +153,34 @@ Demonstration - End-to-end VC based on cascaded ASR+TTS (Baseline system for Voice Conversion Challenge 2020!) ### SLU: Speech Language Understanding -- Predicting intent by directly classifying it as one of intent or decoding by character -- Transformer & RNN based encoder-decoder model -- Establish SOTA results with spectral augmentation (Performs better than reported results of pretrained model on Fluent Speech Command Dataset) +- Architecture + - Transformer based Encoder + - Conformer based Encoder + - RNN based Decoder + - Transformer based Decoder +- Support Multitasking with ASR + - Predict both intent and ASR transcript +- Support Multitasking with NLU + - Deliberation encoder based 2 pass model +- Support using pretrained ASR models + - Hubert + - Wav2vec2 + - VQ-APC + - TERA and more ... +- Support using pretrained NLP models + - BERT + - MPNet And more... +- Various language support + - En / Jp / Zn / Nl / And more... +- Supports using context from previous utterances +- Supports using other tasks like SE in pipeline manner +Demonstration +- Performing noisy spoken language understanding using speech enhancement model followed by spoken language understanding model. 
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/14nCrJ05vJcQX0cJuXjbMVFWUHJ3Wfb6N?usp=sharing) +- Integrated to [Huggingface Spaces](https://huggingface.co/spaces) with [Gradio](https://github.com/gradio-app/gradio). See SLU demo on multiple languages: [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/Siddhant/ESPnet2-SLU) + + +### SUM: Speech Summarization +- End to End Speech Summarization Recipe for Instructional Videos using Restricted Self-Attention [[Sharma et al., 2022]](https://arxiv.org/abs/2110.06263) ### DNN Framework - Flexible network architecture thanks to chainer and pytorch @@ -215,22 +244,22 @@ You can find useful tutorials and demos in [Interspeech 2019 Tutorial](https://g We list the character error rate (CER) and word error rate (WER) of major ASR tasks. -| Task | CER (%) | WER (%) | Pretrained model| -| ----------- | :----: | :----: | :----: | -| Aishell dev/test | 4.6/5.1 | N/A | [link](https://github.com/espnet/espnet/blob/master/egs/aishell/asr1/RESULTS.md#conformer-kernel-size--15--specaugment--lm-weight--00-result) | -| **ESPnet2** Aishell dev/test | 4.4/4.7 | N/A | [link](https://github.com/espnet/espnet/tree/master/egs2/aishell/asr1#conformer--specaug--speed-perturbation-featsraw-n_fft512-hop_length128) | -| Common Voice dev/test | 1.7/1.8 | 2.2/2.3 | [link](https://github.com/espnet/espnet/blob/master/egs/commonvoice/asr1/RESULTS.md#first-results-default-pytorch-transformer-setting-with-bpe-100-epochs-single-gpu) | -| CSJ eval1/eval2/eval3 | 5.7/3.8/4.2 | N/A | [link](https://github.com/espnet/espnet/blob/master/egs/csj/asr1/RESULTS.md#pytorch-backend-transformer-without-any-hyperparameter-tuning) | -| **ESPnet2** CSJ eval1/eval2/eval3 | 4.5/3.3/3.6 | N/A | [link](https://github.com/espnet/espnet/tree/master/egs2/csj/asr1#initial-conformer-results) | -| HKUST dev | 23.5 | N/A | 
[link](https://github.com/espnet/espnet/blob/master/egs/hkust/asr1/RESULTS.md#transformer-only-20-epochs) | -| **ESPnet2** HKUST dev | 21.2 | N/A | [link](https://github.com/espnet/espnet/tree/master/egs2/hkust/asr1#transformer-asr--transformer-lm) | -| Librispeech dev_clean/dev_other/test_clean/test_other | N/A | 1.9/4.9/2.1/4.9 | [link](https://github.com/espnet/espnet/blob/master/egs/librispeech/asr1/RESULTS.md#pytorch-large-conformer-with-specaug--speed-perturbation-8-gpus--transformer-lm-4-gpus) | -| **ESPnet2** Librispeech dev_clean/dev_other/test_clean/test_other | 0.6/1.5/0.6/1.4 | 1.7/3.4/1.8/3.6 | [link](https://github.com/espnet/espnet/tree/master/egs2/librispeech/asr1#self-supervised-learning-features-hubert_large_ll60k-conformer-utt_mvn-with-transformer-lm) | -| Switchboard (eval2000) callhm/swbd | N/A | 14.0/6.8 | [link](https://github.com/espnet/espnet/blob/master/egs/swbd/asr1/RESULTS.md#conformer-with-bpe-2000-specaug-speed-perturbation-transformer-lm-decoding) | -| TEDLIUM2 dev/test | N/A | 8.6/7.2 | [link](https://github.com/espnet/espnet/blob/master/egs/tedlium2/asr1/RESULTS.md#conformer-large-model--specaug--speed-perturbation--rnnlm) | -| TEDLIUM3 dev/test | N/A | 9.6/7.6 | [link](https://github.com/espnet/espnet/blob/master/egs/tedlium3/asr1/RESULTS.md) | -| WSJ dev93/eval92 | 3.2/2.1 | 7.0/4.7 | N/A | -| **ESPnet2** WSJ dev93/eval92 | 1.1/0.8 | 2.8/1.8 | [link](https://github.com/espnet/espnet/tree/master/egs2/wsj/asr1#self-supervised-learning-features-wav2vec2_large_ll60k-conformer-utt_mvn-with-transformer-lm) | +| Task | CER (%) | WER (%) | Pretrained model | +| ----------------------------------------------------------------- | :-------------: | :-------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Aishell dev/test | 4.6/5.1 | N/A | 
[link](https://github.com/espnet/espnet/blob/master/egs/aishell/asr1/RESULTS.md#conformer-kernel-size--15--specaugment--lm-weight--00-result) | +| **ESPnet2** Aishell dev/test | 4.4/4.7 | N/A | [link](https://github.com/espnet/espnet/tree/master/egs2/aishell/asr1#conformer--specaug--speed-perturbation-featsraw-n_fft512-hop_length128) | +| Common Voice dev/test | 1.7/1.8 | 2.2/2.3 | [link](https://github.com/espnet/espnet/blob/master/egs/commonvoice/asr1/RESULTS.md#first-results-default-pytorch-transformer-setting-with-bpe-100-epochs-single-gpu) | +| CSJ eval1/eval2/eval3 | 5.7/3.8/4.2 | N/A | [link](https://github.com/espnet/espnet/blob/master/egs/csj/asr1/RESULTS.md#pytorch-backend-transformer-without-any-hyperparameter-tuning) | +| **ESPnet2** CSJ eval1/eval2/eval3 | 4.5/3.3/3.6 | N/A | [link](https://github.com/espnet/espnet/tree/master/egs2/csj/asr1#initial-conformer-results) | +| HKUST dev | 23.5 | N/A | [link](https://github.com/espnet/espnet/blob/master/egs/hkust/asr1/RESULTS.md#transformer-only-20-epochs) | +| **ESPnet2** HKUST dev | 21.2 | N/A | [link](https://github.com/espnet/espnet/tree/master/egs2/hkust/asr1#transformer-asr--transformer-lm) | +| Librispeech dev_clean/dev_other/test_clean/test_other | N/A | 1.9/4.9/2.1/4.9 | [link](https://github.com/espnet/espnet/blob/master/egs/librispeech/asr1/RESULTS.md#pytorch-large-conformer-with-specaug--speed-perturbation-8-gpus--transformer-lm-4-gpus) | +| **ESPnet2** Librispeech dev_clean/dev_other/test_clean/test_other | 0.6/1.5/0.6/1.4 | 1.7/3.4/1.8/3.6 | [link](https://github.com/espnet/espnet/tree/master/egs2/librispeech/asr1#self-supervised-learning-features-hubert_large_ll60k-conformer-utt_mvn-with-transformer-lm) | +| Switchboard (eval2000) callhm/swbd | N/A | 14.0/6.8 | [link](https://github.com/espnet/espnet/blob/master/egs/swbd/asr1/RESULTS.md#conformer-with-bpe-2000-specaug-speed-perturbation-transformer-lm-decoding) | +| TEDLIUM2 dev/test | N/A | 8.6/7.2 | 
[link](https://github.com/espnet/espnet/blob/master/egs/tedlium2/asr1/RESULTS.md#conformer-large-model--specaug--speed-perturbation--rnnlm) | +| TEDLIUM3 dev/test | N/A | 9.6/7.6 | [link](https://github.com/espnet/espnet/blob/master/egs/tedlium3/asr1/RESULTS.md) | +| WSJ dev93/eval92 | 3.2/2.1 | 7.0/4.7 | N/A | +| **ESPnet2** WSJ dev93/eval92 | 1.1/0.8 | 2.8/1.8 | [link](https://github.com/espnet/espnet/tree/master/egs2/wsj/asr1#self-supervised-learning-features-wav2vec2_large_ll60k-conformer-utt_mvn-with-transformer-lm) | Note that the performance of the CSJ, HKUST, and Librispeech tasks was significantly improved by using the wide network (#units = 1024) and large subword units if necessary reported by [RWTH](https://arxiv.org/pdf/1805.03294.pdf). @@ -257,7 +286,7 @@ The sampling rate must be consistent with that of data used in training. Available pretrained models in the demo script are listed as below. | Model | Notes | -| :------ | :------ | +| :----------------------------------------------------------------------------------------------- | :--------------------------------------------------------- | | [tedlium2.rnn.v1](https://drive.google.com/open?id=1UqIY6WJMZ4sxNxSugUqp3mrGb3j6h7xe) | Streaming decoding based on CTC-based VAD | | [tedlium2.rnn.v2](https://drive.google.com/open?id=1cac5Uc09lJrCYfWkLQsF8eapQcxZnYdf) | Streaming decoding based on CTC-based VAD (batch decoding) | | [tedlium2.transformer.v1](https://drive.google.com/open?id=1cVeSOYY1twOfL9Gns7Z3ZDnkrJqNwPow) | Joint-CTC attention Transformer trained on Tedlium 2 | @@ -274,11 +303,11 @@ Available pretrained models in the demo script are listed as below. We list results from three different models on WSJ0-2mix, which is one the most widely used benchmark dataset for speech separation. 
-|Model|STOI|SAR|SDR|SIR| -|---|---|---|---|---| -|[TF Masking](https://zenodo.org/record/4498554)|0.89|11.40|10.24|18.04| -|[Conv-Tasnet](https://zenodo.org/record/4498562)|0.95|16.62|15.94|25.90| -|[DPRNN-Tasnet](https://zenodo.org/record/4688000)|0.96|18.82|18.29|28.92| +| Model | STOI | SAR | SDR | SIR | +| ------------------------------------------------- | ---- | ----- | ----- | ----- | +| [TF Masking](https://zenodo.org/record/4498554) | 0.89 | 11.40 | 10.24 | 18.04 | +| [Conv-Tasnet](https://zenodo.org/record/4498562) | 0.95 | 16.62 | 15.94 | 25.90 | +| [DPRNN-Tasnet](https://zenodo.org/record/4688000) | 0.96 | 18.82 | 18.29 | 28.92 | @@ -300,23 +329,23 @@ It is based on ESPnet2. Pretrained models are available for both speech enhancem We list 4-gram BLEU of major ST tasks. #### end-to-end system -| Task | BLEU | Pretrained model | -| ---- | :----: | :----: | +| Task | BLEU | Pretrained model | +| ------------------------------------------------- | :---: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | | Fisher-CallHome Spanish fisher_test (Es->En) | 51.03 | [link](https://github.com/espnet/espnet/blob/master/egs/fisher_callhome_spanish/st1/RESULTS.md#train_spen_lcrm_pytorch_train_pytorch_transformer_bpe_short_long_bpe1000_specaug_asrtrans_mttrans) | | Fisher-CallHome Spanish callhome_evltest (Es->En) | 20.44 | [link](https://github.com/espnet/espnet/blob/master/egs/fisher_callhome_spanish/st1/RESULTS.md#train_spen_lcrm_pytorch_train_pytorch_transformer_bpe_short_long_bpe1000_specaug_asrtrans_mttrans) | -| Libri-trans test (En->Fr) | 16.70 | [link](https://github.com/espnet/espnet/blob/master/egs/libri_trans/st1/RESULTS.md#train_spfr_lc_pytorch_train_pytorch_transformer_bpe_short_long_bpe1000_specaug_asrtrans_mttrans-1) | -| How2 dev5 (En->Pt) | 45.68 | 
[link](https://github.com/espnet/espnet/blob/master/egs/how2/st1/RESULTS.md#trainpt_tc_pytorch_train_pytorch_transformer_short_long_bpe8000_specaug_asrtrans_mttrans-1) | -| Must-C tst-COMMON (En->De) | 22.91 | [link](https://github.com/espnet/espnet/blob/master/egs/must_c/st1/RESULTS.md#train_spen-dede_tc_pytorch_train_pytorch_transformer_short_long_bpe8000_specaug_asrtrans_mttrans) | -| Mboshi-French dev (Fr->Mboshi) | 6.18 | N/A | +| Libri-trans test (En->Fr) | 16.70 | [link](https://github.com/espnet/espnet/blob/master/egs/libri_trans/st1/RESULTS.md#train_spfr_lc_pytorch_train_pytorch_transformer_bpe_short_long_bpe1000_specaug_asrtrans_mttrans-1) | +| How2 dev5 (En->Pt) | 45.68 | [link](https://github.com/espnet/espnet/blob/master/egs/how2/st1/RESULTS.md#trainpt_tc_pytorch_train_pytorch_transformer_short_long_bpe8000_specaug_asrtrans_mttrans-1) | +| Must-C tst-COMMON (En->De) | 22.91 | [link](https://github.com/espnet/espnet/blob/master/egs/must_c/st1/RESULTS.md#train_spen-dede_tc_pytorch_train_pytorch_transformer_short_long_bpe8000_specaug_asrtrans_mttrans) | +| Mboshi-French dev (Fr->Mboshi) | 6.18 | N/A | #### cascaded system -| Task | BLEU | Pretrained model | -| ---- | :----: | :----: | -| Fisher-CallHome Spanish fisher_test (Es->En) | 42.16 | N/A | -| Fisher-CallHome Spanish callhome_evltest (Es->En) | 19.82 | N/A | -| Libri-trans test (En->Fr) | 16.96 | N/A | -| How2 dev5 (En->Pt) | 44.90 | N/A | -| Must-C tst-COMMON (En->De) | 23.65 | N/A | +| Task | BLEU | Pretrained model | +| ------------------------------------------------- | :---: | :--------------: | +| Fisher-CallHome Spanish fisher_test (Es->En) | 42.16 | N/A | +| Fisher-CallHome Spanish callhome_evltest (Es->En) | 19.82 | N/A | +| Libri-trans test (En->Fr) | 16.96 | N/A | +| How2 dev5 (En->Pt) | 44.90 | N/A | +| Must-C tst-COMMON (En->De) | 23.65 | N/A | If you want to check the results of the other recipes, please check `egs//st1/RESULTS.md`. 
@@ -349,9 +378,9 @@ The sampling rate must be consistent with that of data used in training. Available pretrained models in the demo script are listed as below. -| Model | Notes | -| :------ | :------ | -| [fisher_callhome_spanish.transformer.v1](https://drive.google.com/open?id=1hawp5ZLw4_SIHIT3edglxbKIIkPVe8n3) | Transformer-ST trained on Fisher-CallHome Spanish Es->En | +| Model | Notes | +| :----------------------------------------------------------------------------------------------------------- | :------------------------------------------------------- | +| [fisher_callhome_spanish.transformer.v1](https://drive.google.com/open?id=1hawp5ZLw4_SIHIT3edglxbKIIkPVe8n3) | Transformer-ST trained on Fisher-CallHome Spanish Es->En | @@ -360,17 +389,18 @@ Available pretrained models in the demo script are listed as below.
expand
-| Task | BLEU | Pretrained model | -| ---- | :----: | :----: | +| Task | BLEU | Pretrained model | +| ------------------------------------------------- | :---: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------: | | Fisher-CallHome Spanish fisher_test (Es->En) | 61.45 | [link](https://github.com/espnet/espnet/blob/master/egs/fisher_callhome_spanish/mt1/RESULTS.md#trainen_lcrm_lcrm_pytorch_train_pytorch_transformer_bpe_bpe1000) | | Fisher-CallHome Spanish callhome_evltest (Es->En) | 29.86 | [link](https://github.com/espnet/espnet/blob/master/egs/fisher_callhome_spanish/mt1/RESULTS.md#trainen_lcrm_lcrm_pytorch_train_pytorch_transformer_bpe_bpe1000) | -| Libri-trans test (En->Fr) | 18.09 | [link](https://github.com/espnet/espnet/blob/master/egs/libri_trans/mt1/RESULTS.md#trainfr_lcrm_tc_pytorch_train_pytorch_transformer_bpe1000) | -| How2 dev5 (En->Pt) | 58.61 | [link](https://github.com/espnet/espnet/blob/master/egs/how2/mt1/RESULTS.md#trainpt_tc_tc_pytorch_train_pytorch_transformer_bpe8000) | -| Must-C tst-COMMON (En->De) | 27.63 | [link](https://github.com/espnet/espnet/blob/master/egs/must_c/mt1/RESULTS.md#summary-4-gram-bleu) | -| IWSLT'14 test2014 (En->De) | 24.70 | [link](https://github.com/espnet/espnet/blob/master/egs/iwslt16/mt1/RESULTS.md#result) | -| IWSLT'14 test2014 (De->En) | 29.22 | [link](https://github.com/espnet/espnet/blob/master/egs/iwslt16/mt1/RESULTS.md#result) | -| IWSLT'16 test2014 (En->De) | 24.05 | [link](https://github.com/espnet/espnet/blob/master/egs/iwslt16/mt1/RESULTS.md#result) | -| IWSLT'16 test2014 (De->En) | 29.13 | [link](https://github.com/espnet/espnet/blob/master/egs/iwslt16/mt1/RESULTS.md#result) | +| Libri-trans test (En->Fr) | 18.09 | [link](https://github.com/espnet/espnet/blob/master/egs/libri_trans/mt1/RESULTS.md#trainfr_lcrm_tc_pytorch_train_pytorch_transformer_bpe1000) | +| How2 dev5 (En->Pt) | 58.61 | 
[link](https://github.com/espnet/espnet/blob/master/egs/how2/mt1/RESULTS.md#trainpt_tc_tc_pytorch_train_pytorch_transformer_bpe8000) | +| Must-C tst-COMMON (En->De) | 27.63 | [link](https://github.com/espnet/espnet/blob/master/egs/must_c/mt1/RESULTS.md#summary-4-gram-bleu) | +| IWSLT'14 test2014 (En->De) | 24.70 | [link](https://github.com/espnet/espnet/blob/master/egs/iwslt16/mt1/RESULTS.md#result) | +| IWSLT'14 test2014 (De->En) | 29.22 | [link](https://github.com/espnet/espnet/blob/master/egs/iwslt16/mt1/RESULTS.md#result) | +| IWSLT'14 test2014 (De->En) | 32.2 | [link](https://github.com/espnet/espnet/blob/master/egs2/iwslt14/mt1/README.md) | +| IWSLT'16 test2014 (En->De) | 24.05 | [link](https://github.com/espnet/espnet/blob/master/egs/iwslt16/mt1/RESULTS.md#result) | +| IWSLT'16 test2014 (De->En) | 29.13 | [link](https://github.com/espnet/espnet/blob/master/egs/iwslt16/mt1/RESULTS.md#result) |
@@ -430,19 +460,19 @@ If you want to build your own neural vocoder, please check the above repositorie Here we list all of the pretrained neural vocoders. Please download and enjoy the generation of high quality speech! | Model link | Lang | Fs [Hz] | Mel range [Hz] | FFT / Shift / Win [pt] | Model type | -| :------ | :---: | :----: | :--------: | :---------------: | :------ | -| [ljspeech.wavenet.softmax.ns.v1](https://drive.google.com/open?id=1eA1VcRS9jzFa-DovyTgJLQ_jmwOLIi8L) | EN | 22.05k | None | 1024 / 256 / None | [Softmax WaveNet](https://github.com/kan-bayashi/PytorchWaveNetVocoder) | -| [ljspeech.wavenet.mol.v1](https://drive.google.com/open?id=1sY7gEUg39QaO1szuN62-Llst9TrFno2t) | EN | 22.05k | None | 1024 / 256 / None | [MoL WaveNet](https://github.com/r9y9/wavenet_vocoder) | -| [ljspeech.parallel_wavegan.v1](https://drive.google.com/open?id=1tv9GKyRT4CDsvUWKwH3s_OfXkiTi0gw7) | EN | 22.05k | None | 1024 / 256 / None | [Parallel WaveGAN](https://github.com/kan-bayashi/ParallelWaveGAN) | -| [ljspeech.wavenet.mol.v2](https://drive.google.com/open?id=1es2HuKUeKVtEdq6YDtAsLNpqCy4fhIXr) | EN | 22.05k | 80-7600 | 1024 / 256 / None | [MoL WaveNet](https://github.com/r9y9/wavenet_vocoder) | -| [ljspeech.parallel_wavegan.v2](https://drive.google.com/open?id=1Grn7X9wD35UcDJ5F7chwdTqTa4U7DeVB) | EN | 22.05k | 80-7600 | 1024 / 256 / None | [Parallel WaveGAN](https://github.com/kan-bayashi/ParallelWaveGAN) | -| [ljspeech.melgan.v1](https://drive.google.com/open?id=1ipPWYl8FBNRlBFaKj1-i23eQpW_W_YcR) | EN | 22.05k | 80-7600 | 1024 / 256 / None | [MelGAN](https://github.com/kan-bayashi/ParallelWaveGAN) | -| [ljspeech.melgan.v3](https://drive.google.com/open?id=1_a8faVA5OGCzIcJNw4blQYjfG4oA9VEt) | EN | 22.05k | 80-7600 | 1024 / 256 / None | [MelGAN](https://github.com/kan-bayashi/ParallelWaveGAN) | -| [libritts.wavenet.mol.v1](https://drive.google.com/open?id=1jHUUmQFjWiQGyDd7ZeiCThSjjpbF_B4h) | EN | 24k | None | 1024 / 256 / None | [MoL 
WaveNet](https://github.com/r9y9/wavenet_vocoder) | -| [jsut.wavenet.mol.v1](https://drive.google.com/open?id=187xvyNbmJVZ0EZ1XHCdyjZHTXK9EcfkK) | JP | 24k | 80-7600 | 2048 / 300 / 1200 | [MoL WaveNet](https://github.com/r9y9/wavenet_vocoder) | -| [jsut.parallel_wavegan.v1](https://drive.google.com/open?id=1OwrUQzAmvjj1x9cDhnZPp6dqtsEqGEJM) | JP | 24k | 80-7600 | 2048 / 300 / 1200 | [Parallel WaveGAN](https://github.com/kan-bayashi/ParallelWaveGAN) | -| [csmsc.wavenet.mol.v1](https://drive.google.com/open?id=1PsjFRV5eUP0HHwBaRYya9smKy5ghXKzj) | ZH | 24k | 80-7600 | 2048 / 300 / 1200 | [MoL WaveNet](https://github.com/r9y9/wavenet_vocoder) | -| [csmsc.parallel_wavegan.v1](https://drive.google.com/open?id=10M6H88jEUGbRWBmU1Ff2VaTmOAeL8CEy) | ZH | 24k | 80-7600 | 2048 / 300 / 1200 | [Parallel WaveGAN](https://github.com/kan-bayashi/ParallelWaveGAN) | +| :--------------------------------------------------------------------------------------------------- | :---: | :-----: | :------------: | :--------------------: | :---------------------------------------------------------------------- | +| [ljspeech.wavenet.softmax.ns.v1](https://drive.google.com/open?id=1eA1VcRS9jzFa-DovyTgJLQ_jmwOLIi8L) | EN | 22.05k | None | 1024 / 256 / None | [Softmax WaveNet](https://github.com/kan-bayashi/PytorchWaveNetVocoder) | +| [ljspeech.wavenet.mol.v1](https://drive.google.com/open?id=1sY7gEUg39QaO1szuN62-Llst9TrFno2t) | EN | 22.05k | None | 1024 / 256 / None | [MoL WaveNet](https://github.com/r9y9/wavenet_vocoder) | +| [ljspeech.parallel_wavegan.v1](https://drive.google.com/open?id=1tv9GKyRT4CDsvUWKwH3s_OfXkiTi0gw7) | EN | 22.05k | None | 1024 / 256 / None | [Parallel WaveGAN](https://github.com/kan-bayashi/ParallelWaveGAN) | +| [ljspeech.wavenet.mol.v2](https://drive.google.com/open?id=1es2HuKUeKVtEdq6YDtAsLNpqCy4fhIXr) | EN | 22.05k | 80-7600 | 1024 / 256 / None | [MoL WaveNet](https://github.com/r9y9/wavenet_vocoder) | +| 
[ljspeech.parallel_wavegan.v2](https://drive.google.com/open?id=1Grn7X9wD35UcDJ5F7chwdTqTa4U7DeVB) | EN | 22.05k | 80-7600 | 1024 / 256 / None | [Parallel WaveGAN](https://github.com/kan-bayashi/ParallelWaveGAN) | +| [ljspeech.melgan.v1](https://drive.google.com/open?id=1ipPWYl8FBNRlBFaKj1-i23eQpW_W_YcR) | EN | 22.05k | 80-7600 | 1024 / 256 / None | [MelGAN](https://github.com/kan-bayashi/ParallelWaveGAN) | +| [ljspeech.melgan.v3](https://drive.google.com/open?id=1_a8faVA5OGCzIcJNw4blQYjfG4oA9VEt) | EN | 22.05k | 80-7600 | 1024 / 256 / None | [MelGAN](https://github.com/kan-bayashi/ParallelWaveGAN) | +| [libritts.wavenet.mol.v1](https://drive.google.com/open?id=1jHUUmQFjWiQGyDd7ZeiCThSjjpbF_B4h) | EN | 24k | None | 1024 / 256 / None | [MoL WaveNet](https://github.com/r9y9/wavenet_vocoder) | +| [jsut.wavenet.mol.v1](https://drive.google.com/open?id=187xvyNbmJVZ0EZ1XHCdyjZHTXK9EcfkK) | JP | 24k | 80-7600 | 2048 / 300 / 1200 | [MoL WaveNet](https://github.com/r9y9/wavenet_vocoder) | +| [jsut.parallel_wavegan.v1](https://drive.google.com/open?id=1OwrUQzAmvjj1x9cDhnZPp6dqtsEqGEJM) | JP | 24k | 80-7600 | 2048 / 300 / 1200 | [Parallel WaveGAN](https://github.com/kan-bayashi/ParallelWaveGAN) | +| [csmsc.wavenet.mol.v1](https://drive.google.com/open?id=1PsjFRV5eUP0HHwBaRYya9smKy5ghXKzj) | ZH | 24k | 80-7600 | 2048 / 300 / 1200 | [MoL WaveNet](https://github.com/r9y9/wavenet_vocoder) | +| [csmsc.parallel_wavegan.v1](https://drive.google.com/open?id=10M6H88jEUGbRWBmU1Ff2VaTmOAeL8CEy) | ZH | 24k | 80-7600 | 2048 / 300 / 1200 | [Parallel WaveGAN](https://github.com/kan-bayashi/ParallelWaveGAN) | If you want to use the above pretrained vocoders, please exactly match the feature setting with them. @@ -528,11 +558,33 @@ You can download converted samples of the cascade ASR+TTS baseline system [here] ### SLU results -
ESPnet2
+
expand
+ + +We list the performance on various SLU tasks and dataset using the metric reported in the original dataset paper + +| Task | Dataset | Metric | Result | Pretrained Model | +| ----------------------------------------------------------------- | :-------------: | :-------------: | :-------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Intent Classification | SLURP | Acc | 86.3 | [link](https://github.com/espnet/espnet/tree/master/egs2/slurp/asr1/README.md) | +| Intent Classification | FSC | Acc | 99.6 | [link](https://github.com/espnet/espnet/tree/master/egs2/fsc/asr1/README.md) | +| Intent Classification | FSC Unseen Speaker Set | Acc | 98.6 | [link](https://github.com/espnet/espnet/tree/master/egs2/fsc_unseen/asr1/README.md) | +| Intent Classification | FSC Unseen Utterance Set | Acc | 86.4 | [link](https://github.com/espnet/espnet/tree/master/egs2/fsc_unseen/asr1/README.md) | +| Intent Classification | FSC Challenge Speaker Set | Acc | 97.5 | [link](https://github.com/espnet/espnet/tree/master/egs2/fsc_challenge/asr1/README.md) | +| Intent Classification | FSC Challenge Utterance Set | Acc | 78.5 | [link](https://github.com/espnet/espnet/tree/master/egs2/fsc_challenge/asr1/README.md) | +| Intent Classification | SNIPS | F1 | 91.7 | [link](https://github.com/espnet/espnet/tree/master/egs2/snips/asr1/README.md) | +| Intent Classification | Grabo (Nl) | Acc | 97.2 | [link](https://github.com/espnet/espnet/tree/master/egs2/grabo/asr1/README.md) | +| Intent Classification | CAT SLU MAP (Zn) | Acc | 78.9 | [link](https://github.com/espnet/espnet/tree/master/egs2/catslu/asr1/README.md) | +| Intent Classification | Google Speech Commands | Acc | 98.4 | [link](https://github.com/espnet/espnet/tree/master/egs2/speechcommands/asr1/README.md) | +| Slot Filling | SLURP | SLU-F1 | 71.9 | 
[link](https://github.com/espnet/espnet/tree/master/egs2/slurp_entity/asr1/README.md) | +| Dialogue Act Classification | Switchboard | Acc | 67.5 | [link](https://github.com/espnet/espnet/tree/master/egs2/swbd_da/asr1/README.md) | +| Dialogue Act Classification | Jdcinal (Jp) | Acc | 67.4 | [link](https://github.com/espnet/espnet/tree/master/egs2/jdcinal/asr1/README.md) | +| Emotion Recognition | IEMOCAP | Acc | 69.4 | [link](https://github.com/espnet/espnet/tree/master/egs2/iemocap/asr1/README.md) | +| Emotion Recognition | swbd_sentiment | Macro F1 | 61.4 | [link](https://github.com/espnet/espnet/tree/master/egs2/swbd_sentiment/asr1/README.md) | +| Emotion Recognition | slue_voxceleb | Macro F1 | 44.0 | [link](https://github.com/espnet/espnet/tree/master/egs2/slue-voxceleb/asr1/README.md) | -- Transformer based SLU for Fluent Speech Command Dataset + +If you want to check the results of the other recipes, please check `egs2//asr1/RESULTS.md`. -In SLU, The objective is to infer the meaning or intent of spoken utterance. The [Fluent Speech Command Dataset](https://fluent.ai/fluent-speech-commands-a-dataset-for-spoken-language-understanding-research/) describes an intent as combination of 3 slot values: action, object and location. You can see baseline results on this dataset [here](https://github.com/espnet/espnet/blob/master/egs2/fsc/asr1/RESULTS.md)
@@ -685,6 +737,8 @@ See the module documentation for more information. It is recommended to use models with RNN-based encoders (such as BLSTMP) for aligning large audio files; rather than using Transformer models that have a high memory consumption on longer audio data. The sample rate of the audio must be consistent with that of the data used in training; adjust with `sox` if needed. + +Also, we can use this tool to provide token-level segmentation information if we prepare a list of tokens instead of a list of utterances in the `text` file. See the discussion in https://github.com/espnet/espnet/issues/4278#issuecomment-1100756463.
diff --git a/ci/doc.sh b/ci/doc.sh index cbcd78f4b21..114bc92b952 100755 --- a/ci/doc.sh +++ b/ci/doc.sh @@ -26,6 +26,8 @@ set -euo pipefail find ./utils/{*.sh,spm_*} -exec ./doc/usage2rst.sh {} \; | tee ./doc/_gen/utils_sh.rst find ./espnet2/bin/*.py -exec ./doc/usage2rst.sh {} \; | tee ./doc/_gen/espnet2_bin.rst +./doc/notebook2rst.sh > ./doc/_gen/notebooks.rst + # generate package doc ./doc/module2rst.py --root espnet espnet2 --dst ./doc --exclude espnet.bin diff --git a/ci/install.sh b/ci/install.sh index eeb531d7ddd..5bfed7584ad 100755 --- a/ci/install.sh +++ b/ci/install.sh @@ -21,7 +21,7 @@ ${CXX:-g++} -v . ./activate_python.sh make TH_VERSION="${TH_VERSION}" - make warp-ctc.done warp-transducer.done chainer_ctc.done nkf.done moses.done mwerSegmenter.done pesq pyopenjtalk.done py3mmseg.done s3prl.done transformers.done phonemizer.done fairseq.done k2.done gtn.done + make warp-ctc.done warp-transducer.done chainer_ctc.done nkf.done moses.done mwerSegmenter.done pesq pyopenjtalk.done py3mmseg.done s3prl.done transformers.done phonemizer.done fairseq.done k2.done gtn.done longformer.done rm -rf kaldi ) . 
tools/activate_python.sh diff --git a/ci/test_integration_espnet2.sh b/ci/test_integration_espnet2.sh index 78086272af7..58951c04011 100755 --- a/ci/test_integration_espnet2.sh +++ b/ci/test_integration_espnet2.sh @@ -100,6 +100,50 @@ if python3 -c "import fairseq" &> /dev/null; then cd "${cwd}" fi +# [ESPnet2] test enh_asr1 recipe +if python -c 'import torch as t; from distutils.version import LooseVersion as L; assert L(t.__version__) >= L("1.2.0")' &> /dev/null; then + cd ./egs2/mini_an4/enh_asr1 + echo "==== [ESPnet2] ENH_ASR ===" + ./run.sh --ngpu 0 --stage 0 --stop-stage 15 --skip-upload_hf false --feats-type "raw" --spk-num 1 --enh_asr_args "--max_epoch=1 --enh_separator_conf num_spk=1" --python "${python}" + # Remove generated files in order to reduce the disk usage + rm -rf exp dump data + cd "${cwd}" +fi + +# [ESPnet2] test st recipe +cd ./egs2/mini_an4/st1 +echo "==== [ESPnet2] ST ===" +./run.sh --stage 1 --stop-stage 1 +feats_types="raw fbank_pitch" +token_types="bpe char" +for t in ${feats_types}; do + ./run.sh --stage 2 --stop-stage 4 --feats-type "${t}" --python "${python}" +done +for t in ${token_types}; do + ./run.sh --stage 5 --stop-stage 5 --tgt_token_type "${t}" --src_token_type "${t}" --python "${python}" +done +for t in ${feats_types}; do + for t2 in ${token_types}; do + echo "==== feats_type=${t}, token_types=${t2} ===" + ./run.sh --ngpu 0 --stage 6 --stop-stage 13 --skip-upload false --feats-type "${t}" --tgt_token_type "${t2}" --src_token_type "${t2}" \ + --st-args "--max_epoch=1" --lm-args "--max_epoch=1" --inference_args "--beam_size 5" --python "${python}" + done +done +echo "==== feats_type=raw, token_types=bpe, model_conf.extract_feats_in_collect_stats=False, normalize=utt_mvn ===" +./run.sh --ngpu 0 --stage 10 --stop-stage 13 --skip-upload false --feats-type "raw" --tgt_token_type "bpe" --src_token_type "bpe" \ + --feats_normalize "utterance_mvn" --lm-args "--max_epoch=1" --inference_args "--beam_size 5" --python "${python}" \ + 
--st-args "--model_conf extract_feats_in_collect_stats=false --max_epoch=1" + +echo "==== use_streaming, feats_type=raw, token_types=bpe, model_conf.extract_feats_in_collect_stats=False, normalize=utt_mvn ===" +./run.sh --use_streaming true --ngpu 0 --stage 6 --stop-stage 13 --skip-upload false --feats-type "raw" --tgt_token_type "bpe" --src_token_type "bpe" \ + --feats_normalize "utterance_mvn" --lm-args "--max_epoch=1" --inference_args "--beam_size 5" --python "${python}" \ + --st-args "--model_conf extract_feats_in_collect_stats=false --max_epoch=1 --encoder=contextual_block_transformer --decoder=transformer + --encoder_conf block_size=40 --encoder_conf hop_size=16 --encoder_conf look_ahead=16" + +# Remove generated files in order to reduce the disk usage +rm -rf exp dump data +cd "${cwd}" + # [ESPnet2] Validate configuration files echo "" > dummy_token_list echo "==== [ESPnet2] Validation configuration files ===" @@ -124,6 +168,9 @@ if python3 -c 'import torch as t; from distutils.version import LooseVersion as for f in egs2/*/ssl1/conf/train*.yaml; do ${python} -m espnet2.bin.hubert_train --config "${f}" --iterator_type none --normalize none --dry_run true --output_dir out --token_list dummy_token_list done + for f in egs2/*/enh_asr1/conf/train_enh_asr*.yaml; do + ${python} -m espnet2.bin.enh_s2t_train --config "${f}" --iterator_type none --dry_run true --output_dir out --token_list dummy_token_list + done fi # These files must be same each other. 
diff --git a/doc/.gitignore b/doc/.gitignore index d4058a5aa91..79f7202744d 100644 --- a/doc/.gitignore +++ b/doc/.gitignore @@ -1,4 +1,4 @@ _gen/ _build/ build/ - +notebook/ \ No newline at end of file diff --git a/doc/argparse2rst.py b/doc/argparse2rst.py index 790049e0bc9..684673d90a3 100755 --- a/doc/argparse2rst.py +++ b/doc/argparse2rst.py @@ -20,11 +20,16 @@ def __init__(self, path): def get_parser(): parser = configargparse.ArgumentParser( - description='generate RST from argparse options', + description="generate RST from argparse options", config_file_parser_class=configargparse.YAMLConfigFileParser, - formatter_class=configargparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('src', type=str, nargs='+', - help='source python files that contain get_parser() func') + formatter_class=configargparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "src", + type=str, + nargs="+", + help="source python files that contain get_parser() func", + ) return parser @@ -53,7 +58,8 @@ def get_parser(): for m in modinfo: cmd = m.path.name sep = "~" * len(cmd) - print(f""" + print( + f""" .. _{cmd}: @@ -65,4 +71,5 @@ def get_parser(): :func: get_parser :prog: {cmd} -""") +""" + ) diff --git a/doc/conf.py b/doc/conf.py index 8aa97c1e42a..c2f5acd1881 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -20,8 +20,8 @@ import os import sys -sys.path.insert(0, os.path.abspath('../espnet/nets')) -sys.path.insert(0, os.path.abspath('../utils')) +sys.path.insert(0, os.path.abspath("../espnet/nets")) +sys.path.insert(0, os.path.abspath("../utils")) # -- General configuration ------------------------------------------------ @@ -35,8 +35,8 @@ extensions = [ "nbsphinx", "sphinx.ext.autodoc", - 'sphinx.ext.napoleon', - 'sphinx.ext.viewcode', + "sphinx.ext.napoleon", + "sphinx.ext.viewcode", "sphinx.ext.mathjax", "sphinx.ext.todo", "sphinxarg.ext", @@ -44,42 +44,46 @@ ] # Add any paths that contain templates here, relative to this directory. 
-templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # # source_suffix = '.rst' -source_suffix = ['.rst', '.md'] +source_suffix = [".rst", ".md"] # enable to markdown from recommonmark.parser import CommonMarkParser source_parsers = { - '.md': CommonMarkParser, + ".md": CommonMarkParser, } # AutoStructify setting ref: https://qiita.com/pashango2/items/d1b379b699af85b529ce from recommonmark.transform import AutoStructify -github_doc_root = 'https://github.com/rtfd/recommonmark/tree/master/doc/' +github_doc_root = "https://github.com/rtfd/recommonmark/tree/master/doc/" def setup(app): - app.add_config_value('recommonmark_config', { - 'url_resolver': lambda url: github_doc_root + url, - 'auto_toc_tree_section': 'Contents', - }, True) + app.add_config_value( + "recommonmark_config", + { + "url_resolver": lambda url: github_doc_root + url, + "auto_toc_tree_section": "Contents", + }, + True, + ) app.add_transform(AutoStructify) # The master toctree document. -master_doc = 'index' +master_doc = "index" # General information about the project. -project = u'ESPnet' -copyright = u'2017, Shinji Watanabe' -author = u'Shinji Watanabe' +project = u"ESPnet" +copyright = u"2017, Shinji Watanabe" +author = u"Shinji Watanabe" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -87,6 +91,7 @@ def setup(app): # # The short X.Y version. import espnet + version = espnet.__version__ # The full version, including alpha/beta/rc tags. release = espnet.__version__ @@ -102,18 +107,21 @@ def setup(app): # directories to ignore when looking for source files. 
# This patterns also effect to html_static_path and html_extra_path exclude_patterns = [ - '_build', 'Thumbs.db', '.DS_Store', "README.md", + "_build", + "Thumbs.db", + ".DS_Store", + "README.md", # NOTE: because these genearate files are directly included # from the other files, we should exclude these files manually. "_gen/modules.rst", "_gen/utils_sh.rst", "_gen/utils_py.rst", "_gen/espnet_bin.rst", - "_gen/espnet-bin.rst" + "_gen/espnet-bin.rst", ] # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = False @@ -127,7 +135,7 @@ def setup(app): # html_theme = 'nature' import sphinx_rtd_theme -html_theme = 'sphinx_rtd_theme' +html_theme = "sphinx_rtd_theme" html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] # Theme options are theme-specific and customize the look and feel of a theme @@ -147,16 +155,16 @@ def setup(app): # This is required for the alabaster theme # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars html_sidebars = { - '**': [ - 'relations.html', # needs 'show_related': True theme option to display - 'searchbox.html', + "**": [ + "relations.html", # needs 'show_related': True theme option to display + "searchbox.html", ] } # -- Options for HTMLHelp output ------------------------------------------ # Output file base name for HTML help builder. -htmlhelp_basename = 'ESPnetdoc' +htmlhelp_basename = "ESPnetdoc" # -- Options for LaTeX output --------------------------------------------- @@ -164,15 +172,12 @@ def setup(app): # The paper size ('letterpaper' or 'a4paper'). # # 'papersize': 'letterpaper', - # The font size ('10pt', '11pt' or '12pt'). # # 'pointsize': '10pt', - # Additional stuff for the LaTeX preamble. 
# # 'preamble': '', - # Latex figure (float) alignment # # 'figure_align': 'htbp', @@ -182,18 +187,14 @@ def setup(app): # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - (master_doc, 'ESPnet.tex', u'ESPnet Documentation', - u'Shinji Watanabe', 'manual'), + (master_doc, "ESPnet.tex", u"ESPnet Documentation", u"Shinji Watanabe", "manual"), ] # -- Options for manual page output --------------------------------------- # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [ - (master_doc, 'espnet', u'ESPnet Documentation', - [author], 1) -] +man_pages = [(master_doc, "espnet", u"ESPnet Documentation", [author], 1)] # -- Options for Texinfo output ------------------------------------------- @@ -201,12 +202,18 @@ def setup(app): # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, 'ESPnet', u'ESPnet Documentation', - author, 'ESPnet', 'One line description of project.', - 'Miscellaneous'), + ( + master_doc, + "ESPnet", + u"ESPnet Documentation", + author, + "ESPnet", + "One line description of project.", + "Miscellaneous", + ), ] -autoclass_content = 'both' +autoclass_content = "both" # NOTE(kan-bayashi): Do not update outputs in notebook automatically. 
-nbsphinx_execute = 'never' +nbsphinx_execute = "never" diff --git a/doc/espnet2_tutorial.md b/doc/espnet2_tutorial.md index 5bdec078cc5..0dd69624a4a 100644 --- a/doc/espnet2_tutorial.md +++ b/doc/espnet2_tutorial.md @@ -180,7 +180,7 @@ You need to do one of the following two ways to change the training configuratio ```sh # Give a configuration file -./run.sh --asr_train_config conf/train_asr.yaml +./run.sh --asr_config conf/train_asr.yaml # Give arguments to "espnet2/bin/asr_train.py" directly ./run.sh --asr_args "--foo arg --bar arg2" ``` @@ -291,7 +291,7 @@ To use SSLRs in your task, you need to make several modifications. ### Usage 1. To reduce the time used in `collect_stats` step, please specify `--feats_normalize uttmvn` in `run.sh` and pass it as arguments to `asr.sh` or other task-specific scripts. (Recommended) 2. In the configuration file, specify the `frontend` and `preencoder`. Taking `HuBERT` as an example: - The `upsteam` name can be whatever supported in S3PRL. `multilayer-feature=True` means the final representation is a weighted-sum of all layers' hidden states from SSLR model. + The `upstream` name can be whatever supported in S3PRL. `multilayer-feature=True` means the final representation is a weighted-sum of all layers' hidden states from SSLR model. ``` frontend: s3prl frontend_conf: diff --git a/doc/index.rst b/doc/index.rst index 13f20ab0a96..30cd3d35fd4 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -28,16 +28,7 @@ ESPnet is an end-to-end speech processing toolkit, mainly focuses on end-to-end ./espnet2_task.md ./espnet2_distributed.md -.. toctree:: - :maxdepth: 1 - :caption: Notebook: - - ./notebook/asr_cli.ipynb - ./notebook/asr_library.ipynb - ./notebook/tts_cli.ipynb - ./notebook/pretrained.ipynb - ./notebook/tts_realtime_demo.ipynb - ./notebook/st_demo.ipynb +.. include:: ./_gen/notebooks.rst .. 
include:: ./_gen/modules.rst diff --git a/doc/installation.md b/doc/installation.md index 999082c9043..db45a09135b 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -32,14 +32,14 @@ the following packages are installed using Anaconda, so you can skip them.) # For CentOS $ sudo yum install libsndfile ``` -- ffmpeg (This is not required when installataion, but used in some recipes) +- ffmpeg (This is not required when installing, but used in some recipes) ```sh # For Ubuntu $ sudo apt-get install ffmpeg # For CentOS $ sudo yum install ffmpeg ``` -- flac (This is not required when installataion, but used in some recipes) +- flac (This is not required when installing, but used in some recipes) ```sh # For Ubuntu $ sudo apt-get install flac @@ -202,14 +202,14 @@ We also have [prebuilt Kaldi binaries](https://github.com/espnet/espnet/blob/mas ```sh $ cd /tools - $ make TH_VERSION=1.3.1 + $ make TH_VERSION=1.10.1 ``` Note that the CUDA version is derived from `nvcc` command. If you'd like to specify the other CUDA version, you need to give `CUDA_VERSION`. ```sh $ cd /tools - $ make TH_VERSION=1.3.1 CUDA_VERSION=10.1 + $ make TH_VERSION=1.10.1 CUDA_VERSION=11.3 ``` If you don't have `nvcc` command, packages are installed for CPU mode by default. 
diff --git a/doc/module2rst.py b/doc/module2rst.py index a4cd4db3f6c..7cb83b9e7ad 100755 --- a/doc/module2rst.py +++ b/doc/module2rst.py @@ -8,15 +8,15 @@ # parser parser = configargparse.ArgumentParser( - description='generate RST files from module recursively into /_gen', + description="generate RST files from module recursively into /_gen", config_file_parser_class=configargparse.YAMLConfigFileParser, - formatter_class=configargparse.ArgumentDefaultsHelpFormatter) -parser.add_argument('--root', nargs='+', - help='root module to generate docs recursively') -parser.add_argument('--dst', type=str, - help='destination path to generate RSTs') -parser.add_argument('--exclude', nargs='*', default=[], - help='exclude module name') + formatter_class=configargparse.ArgumentDefaultsHelpFormatter, +) +parser.add_argument( + "--root", nargs="+", help="root module to generate docs recursively" +) +parser.add_argument("--dst", type=str, help="destination path to generate RSTs") +parser.add_argument("--exclude", nargs="*", default=[], help="exclude module name") args = parser.parse_args() print(args) @@ -36,12 +36,14 @@ def gen_rst(module_path, f): doc = module.__doc__ if doc is None: doc = "" - f.write(f""" + f.write( + f""" {title} {sep} {doc} -""") +""" + ) for cpath in glob(module_path + "/**/*.py", recursive=True): print(cpath) @@ -51,7 +53,8 @@ def gen_rst(module_path, f): continue cname = to_module(cpath) csep = "-" * len(cname) - f.write(f""" + f.write( + f""" .. 
_{cname}: {cname} @@ -62,7 +65,8 @@ def gen_rst(module_path, f): :undoc-members: :show-inheritance: -""") +""" + ) f.flush() diff --git a/doc/notebook b/doc/notebook deleted file mode 160000 index ef3cbf880fc..00000000000 --- a/doc/notebook +++ /dev/null @@ -1 +0,0 @@ -Subproject commit ef3cbf880fcd725d11021e541a0cdfae4080446d diff --git a/doc/notebook2rst.sh b/doc/notebook2rst.sh new file mode 100755 index 00000000000..83bf7d57794 --- /dev/null +++ b/doc/notebook2rst.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +set -euo pipefail + +cd "$(dirname "$0")" + +if [ ! -d notebook ]; then + git clone https://github.com/espnet/notebook --depth 1 +fi + +echo "\ +.. toctree:: + :maxdepth: 1 + :caption: Notebook: +" + +find ./notebook/*.ipynb -exec echo " {}" \; diff --git a/docker/build.sh b/docker/build.sh index 7bef1b94b73..987a0f54ac7 100755 --- a/docker/build.sh +++ b/docker/build.sh @@ -3,6 +3,10 @@ # 2019, Nelson Yalta # 2019, Ludwig Kürzinger, Technische Universität München +log() { + local fname=${BASH_SOURCE[1]##*/} + echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" +} SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" @@ -11,10 +15,7 @@ ubuntu_ver=20.04 cuda_ver=11.1 build_ver=cpu build_cores=24 -th_ver=1.8.0 - -docker_ver=$(docker version -f '{{.Server.Version}}') -echo "Using Docker Ver.${docker_ver}" +th_ver=1.10.1 cmd_usage() { @@ -63,16 +64,16 @@ cmd_usage() { build(){ - echo "Build Latest docker containers" + log "Build Latest docker containers" # build runtime and gpu based containers this_tag=espnet/espnet:runtime-latest docker_image=$( docker images -q ${this_tag} ) if ! [[ -n ${docker_image} ]]; then - echo "Now building Runtime container" + log "Now building Runtime container" docker build --build-arg DOCKER_VER=${docker_ver} \ --build-arg FROM_TAG=${default_ubuntu_ver} \ --build-arg NUM_BUILD_CORES=${build_cores} \ - -f prebuilt/runtime/Dockerfile -t ${this_tag} . 
| tee -a build_runtime.log > /dev/null + -f prebuilt/runtime.dockerfile -t ${this_tag} . | tee -a build_runtime.log > /dev/null docker_image=$( docker images -q ${this_tag} ) [ -z "${docker_image}" ] && exit 1 @@ -81,9 +82,9 @@ build(){ this_tag=espnet/espnet:cuda-latest docker_image=$( docker images -q ${this_tag} ) if ! [[ -n ${docker_image} ]]; then - echo "Now building CUDA container" + log "Now building CUDA container" docker build --build-arg FROM_TAG=runtime-latest \ - -f prebuilt/devel/gpu/${default_cuda_ver}/Dockerfile -t ${this_tag} . | tee -a build_cuda.log > /dev/null + -f prebuilt/gpu.dockerfile -t ${this_tag} . | tee -a build_cuda.log > /dev/null docker_image=$( docker images -q ${this_tag} ) [ -z "${docker_image}" ] && exit 1 fi @@ -93,8 +94,11 @@ build(){ this_tag=espnet/espnet:cpu-latest docker_image=$( docker images -q ${this_tag} ) if ! [[ -n ${docker_image} ]]; then - echo "Now building cpu-latest with ubuntu:${default_ubuntu_ver}" - docker build --build-arg FROM_TAG=runtime-latest -f prebuilt/devel/Dockerfile -t ${this_tag} . | tee -a build_cpu.log > /dev/null + log "Now building cpu-latest with ubuntu:${default_ubuntu_ver}" + docker build --build-arg FROM_TAG=runtime-latest \ + -f prebuilt/devel.dockerfile \ + --target devel \ + -t ${this_tag} . | tee -a build_cpu.log > /dev/null docker_image=$( docker images -q ${this_tag} ) [ -z "${docker_image}" ] && exit 1 @@ -106,8 +110,10 @@ build(){ this_tag=espnet/espnet:gpu-latest docker_image=$( docker images -q ${this_tag} ) if ! [[ -n ${docker_image} ]]; then - echo "Now building gpu-latest with ubuntu:${default_ubuntu_ver} and cuda:${default_cuda_ver}" - docker build ${build_args} -f prebuilt/devel/Dockerfile -t ${this_tag} . | tee -a build_gpu.log > /dev/null + log "Now building gpu-latest with ubuntu:${default_ubuntu_ver} and cuda:${default_cuda_ver}" + docker build ${build_args} -f prebuilt/devel.dockerfile \ + --target devel \ + -t ${this_tag} . 
| tee -a build_gpu.log > /dev/null docker_image=$( docker images -q ${this_tag} ) [ -z "${docker_image}" ] && exit 1 fi @@ -115,20 +121,20 @@ build(){ build_local(){ - echo "Building docker container: base image, and image for ${build_ver}" + log "Building docker container: base image, and image for ${build_ver}" sleep 1 # prepare espnet-repo, assuming that this script is in folder espnet/docker cd ${SCRIPTPATH}/.. ESPNET_ARCHIVE="./espnet-local.tar" - echo "Reconstructing the local repository from the last commit" + log "Reconstructing the local repository from the last commit" git archive -o docker/${ESPNET_ARCHIVE} HEAD || exit 1 cd ${SCRIPTPATH} test -r ${ESPNET_ARCHIVE} || exit 1; sleep 1 if [ "${build_base_image}" = true ]; then - echo "building ESPnet base image with ubuntu:${ubuntu_ver}" + log "building ESPnet base image with ubuntu:${ubuntu_ver}" docker build --build-arg DOCKER_VER=${docker_ver} \ --build-arg FROM_TAG=${ubuntu_ver} \ --build-arg NUM_BUILD_CORES=${build_cores} \ @@ -137,11 +143,11 @@ build_local(){ fi if [[ ${build_ver} == "cpu" ]]; then - echo "building ESPnet CPU Image with ubuntu:${ubuntu_ver}" + log "building ESPnet CPU Image with ubuntu:${ubuntu_ver}" docker build --build-arg FROM_TAG=runtime-local --build-arg ESPNET_ARCHIVE=${ESPNET_ARCHIVE} \ -f prebuilt/local/Dockerfile -t espnet/espnet:cpu-local . || exit 1 elif [[ ${build_ver} == "gpu" ]]; then - echo "building ESPnet GPU Image with ubuntu:${ubuntu_ver} and cuda:${cuda_ver}" + log "building ESPnet GPU Image with ubuntu:${ubuntu_ver} and cuda:${cuda_ver}" if [ "${build_base_image}" = true ] ; then docker build -f prebuilt/devel/gpu/${ver}/Dockerfile -t espnet/espnet:cuda${ver}-cudnn7 . || exit 1 else @@ -154,15 +160,15 @@ build_local(){ build_args="${build_args} --build-arg ESPNET_ARCHIVE=${ESPNET_ARCHIVE}" docker build ${build_args} -f prebuilt/local/Dockerfile -t espnet/espnet:gpu-cuda${ver}-cudnn7-u18-local . 
|| exit 1 else - echo "Parameter invalid: " ${ver} + log "ERROR: Parameter invalid: " ${ver} fi - echo "cleanup." + log "cleanup." test -r ${ESPNET_ARCHIVE} && rm ${ESPNET_ARCHIVE} } run_recipe1(){ - ./run.sh --docker-egs an4/asr1 \ + ./run.sh --docker-egs mini_an4/asr1 \ --docker-cmd run.sh \ --docker-gpu ${1} \ --verbose 1 \ @@ -173,10 +179,10 @@ run_recipe1(){ } run_recipe2(){ - ./run.sh --docker-egs an4/asr1 \ + ./run.sh --docker-egs mini_an4/asr1 \ --docker-cmd run.sh \ --docker-gpu ${1} \ - --docker-env "NLTK_DATA=/espnet/egs2/an4/asr1/nltk_data,HOME=/espnet/egs2/an4/asr1" \ + --docker-env "NLTK_DATA=/espnet/egs2/mini_an4/asr1/nltk_data,HOME=/espnet/egs2/mini_an4/asr1" \ --is-egs2 \ --ngpu ${2} \ --stage ${3} \ @@ -185,11 +191,11 @@ run_recipe2(){ } testing(){ - echo "Testing docker containers" + log "Testing docker containers" # Test Docker Containers with cpu setup run_stage=-1 for backend in chainer pytorch; do - if [ -f ../egs/an4/asr1/dump/train_nodev/deltafalse/data.json ]; then + if [ -f ../egs/mini_an4/asr1/dump/train_nodev/deltafalse/data.json ]; then run_stage=3 fi if [ ! -f .test_cpu_${backend}.done ]; then @@ -199,7 +205,7 @@ testing(){ done for backend in chainer pytorch; do - if [ -f ../egs/an4/asr1/dump/train_nodev/deltafalse/data.json ]; then + if [ -f ../egs/mini_an4/asr1/dump/train_nodev/deltafalse/data.json ]; then run_stage=3 fi if [ ! -f .test_gpu_${backend}.done ]; then @@ -208,7 +214,7 @@ testing(){ fi done - echo "ESPnet egs Done. Press to continue with ESPnet2 egs" + log "ESPnet egs Done. 
Press to continue with ESPnet2 egs" read enter # Test for espnet2 run_stage=-1 @@ -227,7 +233,7 @@ testing(){ push(){ for tag in runtime-latest cuda-latest cpu-latest gpu-latest;do - echo "docker push espnet/espnet:${tag}" + log "docker push espnet/espnet:${tag}" ( docker push espnet/espnet:${tag} )|| exit 1 done } @@ -273,14 +279,15 @@ check=true [ "${default_ubuntu_ver}" != "${ubuntu_ver}" ] || [ "${default_cuda_ver}" != "${cuda_ver}" ] && check=false if [ ${check} = false ] && [ "${mode}" != "fully_local" ]; then - echo "Error: Use of custom versions of Ubuntu (!=${default_ubuntu_ver}) and CUDA (!=${default_cuda_ver}) + log "Error: Use of custom versions of Ubuntu (!=${default_ubuntu_ver}) and CUDA (!=${default_cuda_ver}) is only available for == fully_local. Exiting... " exit 0; fi +docker_ver=$(docker version -f '{{.Server.Version}}') +log "Using Docker Ver.${docker_ver}" -echo "Using Docker Ver.${docker_ver}" ## Application menu if [[ "${mode}" == "build" ]]; then build @@ -302,4 +309,4 @@ else cmd_usage fi -echo "$(basename "$0") done." +log "$(basename "$0") done." diff --git a/docker/prebuilt/Dockerfile b/docker/espnet.dockerfile similarity index 76% rename from docker/prebuilt/Dockerfile rename to docker/espnet.dockerfile index bd2458adf70..b6295ca2a92 100644 --- a/docker/prebuilt/Dockerfile +++ b/docker/espnet.dockerfile @@ -11,9 +11,7 @@ ARG EXTRA_LIBS RUN if [ ${EXTRA_LIBS} = true ]; then \ cd /espnet/tools; \ - . 
./activate_python.sh; \ - pip install parallel_wavegan; \ - pip install git+https://github.com/cybertronai/pytorch-lamb; \ + make extra; \ fi # Add user to container diff --git a/docker/prebuilt/devel/Dockerfile b/docker/prebuilt/devel.dockerfile similarity index 70% rename from docker/prebuilt/devel/Dockerfile rename to docker/prebuilt/devel.dockerfile index ad02a540674..95dc6a41059 100644 --- a/docker/prebuilt/devel/Dockerfile +++ b/docker/prebuilt/devel.dockerfile @@ -1,5 +1,5 @@ ARG FROM_TAG -FROM espnet/espnet:${FROM_TAG} +FROM espnet/espnet:${FROM_TAG} as devel LABEL maintainer "Nelson Yalta " ARG CUDA_VER @@ -48,7 +48,7 @@ RUN if [ -z "${CUDA_VER}" ]; then \ . ./activate_python.sh && \ ./installers/install_warp-ctc.sh && \ ./installers/install_kenlm.sh && \ - # ./installers/install_chainer_ctc.sh && \ + ./installers/install_chainer.sh cpu && \ conda clean --all && \ rm -f *.tar.* && \ pip cache purge @@ -56,3 +56,28 @@ RUN if [ -z "${CUDA_VER}" ]; then \ RUN rm -rf ../espnet WORKDIR / + + +#### For local docker +FROM devel as espnet_local +LABEL maintainer "Nelson Yalta " + +ARG CUDA_VER +WORKDIR / + +# IF using a local ESPNet repository, a temporary file containing the ESPnet git repo is copied over +ARG ESPNET_ARCHIVE=./espnet-local.tar +COPY ${ESPNET_ARCHIVE} /espnet-local.tar + + +# Download ESPnet +RUN echo "Getting ESPnet sources from local repository, in temporary file: " ${ESPNET_ARCHIVE} +RUN mkdir /espnet +RUN tar xf espnet-local.tar -C /espnet/ +RUN rm espnet-local.tar + +RUN cd espnet && \ + rm -rf docker egs test utils + +# Install espnet +WORKDIR /espnet/tools diff --git a/docker/prebuilt/devel/gpu/10.0/Dockerfile b/docker/prebuilt/devel/gpu/10.0/Dockerfile deleted file mode 100644 index f28793740cb..00000000000 --- a/docker/prebuilt/devel/gpu/10.0/Dockerfile +++ /dev/null @@ -1,74 +0,0 @@ -ARG FROM_TAG -FROM espnet/espnet:${FROM_TAG} -LABEL maintainer "Nelson Yalta " - -## FROM CUDA 10.0 base - -RUN apt-get update && apt-get install -y 
--no-install-recommends gnupg2 curl ca-certificates && \ - curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub | apt-key add - && \ - echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \ - echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list && \ - apt-get purge --autoremove -y curl && \ - rm -rf /var/lib/apt/lists/* - -ENV CUDA_VERSION 10.0.130 - -ENV CUDA_PKG_VERSION 10-0=$CUDA_VERSION-1 - -# For libraries in the cuda-compat-* package: https://docs.nvidia.com/cuda/eula/index.html#attachment-a -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-cudart-$CUDA_PKG_VERSION \ - cuda-compat-10-0 && \ - ln -s cuda-10.0 /usr/local/cuda && \ - rm -rf /var/lib/apt/lists/* - -# Required for nvidia-docker v1 -RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ - echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf - -ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} -ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64 - -# nvidia-container-runtime -ENV NVIDIA_VISIBLE_DEVICES all -ENV NVIDIA_DRIVER_CAPABILITIES compute,utility -ENV NVIDIA_REQUIRE_CUDA "cuda>=10.0 brand=tesla,driver>=384,driver<385 brand=tesla,driver>=410,driver<411" - -ENV CUDA_HOME /usr/local/cuda - -## FROM CUDA 10.0 runtime - -ENV NCCL_VERSION 2.4.8 - -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-libraries-$CUDA_PKG_VERSION \ - cuda-nvtx-$CUDA_PKG_VERSION \ - libnccl2=$NCCL_VERSION-1+cuda10.0 && \ - apt-mark hold libnccl2 && \ - rm -rf /var/lib/apt/lists/* - - -## FROM CUDA 10.0 devel - -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-libraries-dev-$CUDA_PKG_VERSION \ - cuda-nvml-dev-$CUDA_PKG_VERSION \ - cuda-minimal-build-$CUDA_PKG_VERSION \ - 
cuda-command-line-tools-$CUDA_PKG_VERSION \ - libnccl-dev=$NCCL_VERSION-1+cuda10.0 && \ - rm -rf /var/lib/apt/lists/* - -ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs - -## FROM CUDA 10.0-CUDNN 7 devel - -ENV CUDNN_VERSION 7.6.5.32 -LABEL com.nvidia.cudnn.version="${CUDNN_VERSION}" - -RUN apt-get update && apt-get install -y --no-install-recommends \ - libcudnn7=$CUDNN_VERSION-1+cuda10.0 \ - libcudnn7-dev=$CUDNN_VERSION-1+cuda10.0 && \ - apt-mark hold libcudnn7 && \ - rm -rf /var/lib/apt/lists/* - -WORKDIR / \ No newline at end of file diff --git a/docker/prebuilt/devel/gpu/10.1/Dockerfile b/docker/prebuilt/devel/gpu/10.1/Dockerfile deleted file mode 100644 index 044128f2910..00000000000 --- a/docker/prebuilt/devel/gpu/10.1/Dockerfile +++ /dev/null @@ -1,39 +0,0 @@ -ARG FROM_TAG -FROM espnet/espnet:${FROM_TAG} -LABEL maintainer "Nelson Yalta " - -## FROM CUDA 10.1 base [https://gitlab.com/nvidia/cuda/blob/ubuntu18.04/10.1/base/Dockerfile] - -RUN apt-get update && apt-get install -y --no-install-recommends gnupg2 curl ca-certificates && \ - curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub | apt-key add - && \ - echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \ - echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list && \ - apt-get purge --autoremove -y curl && \ - rm -rf /var/lib/apt/lists/* - -ENV CUDA_VERSION 10.1.243 - -ENV CUDA_PKG_VERSION 10-1=$CUDA_VERSION-1 - -# For libraries in the cuda-compat-* package: https://docs.nvidia.com/cuda/eula/index.html#attachment-a -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-cudart-$CUDA_PKG_VERSION \ - cuda-compat-10-1 && \ - ln -s cuda-10.1 /usr/local/cuda && \ - rm -rf /var/lib/apt/lists/* - -# Required for nvidia-docker v1 -RUN echo "/usr/local/nvidia/lib" >> 
/etc/ld.so.conf.d/nvidia.conf && \ - echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf - -ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} -ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64 - -# nvidia-container-runtime -ENV NVIDIA_VISIBLE_DEVICES all -ENV NVIDIA_DRIVER_CAPABILITIES compute,utility -ENV NVIDIA_REQUIRE_CUDA "cuda>=10.1 brand=tesla,driver>=396,driver<397 brand=tesla,driver>=410,driver<411 brand=tesla,driver>=418,driver<419" - -ENV CUDA_HOME /usr/local/cuda - -WORKDIR / \ No newline at end of file diff --git a/docker/prebuilt/devel/gpu/10.2/Dockerfile b/docker/prebuilt/devel/gpu/10.2/Dockerfile deleted file mode 100644 index aa3d2c2d9a6..00000000000 --- a/docker/prebuilt/devel/gpu/10.2/Dockerfile +++ /dev/null @@ -1,81 +0,0 @@ -ARG FROM_TAG -FROM espnet/espnet:${FROM_TAG} -LABEL maintainer "Nelson Yalta " - -## FROM CUDA 10.1 base [https://gitlab.com/nvidia/cuda/blob/ubuntu18.04/10.1/base/Dockerfile] - -RUN apt-get update && apt-get install -y --no-install-recommends gnupg2 curl ca-certificates && \ - curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub | apt-key add - && \ - echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \ - echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list && \ - apt-get purge --autoremove -y curl && \ - rm -rf /var/lib/apt/lists/* - -ENV CUDA_VERSION 10.1.168 - -ENV CUDA_PKG_VERSION 10-1=$CUDA_VERSION-1 - -# For libraries in the cuda-compat-* package: https://docs.nvidia.com/cuda/eula/index.html#attachment-a -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-cudart-$CUDA_PKG_VERSION \ - cuda-compat-10-1 && \ - ln -s cuda-10.1 /usr/local/cuda && \ - rm -rf /var/lib/apt/lists/* - -# Required for nvidia-docker v1 -RUN echo "/usr/local/nvidia/lib" >> 
/etc/ld.so.conf.d/nvidia.conf && \ - echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf - -ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} -ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64 - -# nvidia-container-runtime -ENV NVIDIA_VISIBLE_DEVICES all -ENV NVIDIA_DRIVER_CAPABILITIES compute,utility -ENV NVIDIA_REQUIRE_CUDA "cuda>=10.1 brand=tesla,driver>=384,driver<385 brand=tesla,driver>=396,driver<397 brand=tesla,driver>=410,driver<411" - -ENV CUDA_HOME /usr/local/cuda - -## FROM CUDA 10.1 runtime [https://gitlab.com/nvidia/cuda/blob/ubuntu18.04/10.1/runtime/Dockerfile] - -ENV NCCL_VERSION 2.7.8 - -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-libraries-$CUDA_PKG_VERSION \ - cuda-nvtx-$CUDA_PKG_VERSION \ - libnccl2=$NCCL_VERSION-1+cuda10.1 && \ - apt-mark hold libnccl2 && \ - rm -rf /var/lib/apt/lists/* - -## FROM CUDA 10.1 devel [https://gitlab.com/nvidia/cuda/blob/ubuntu18.04/10.1/devel/Dockerfile] - -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-nvml-dev-$CUDA_PKG_VERSION \ - cuda-command-line-tools-$CUDA_PKG_VERSION \ - cuda-nvprof-$CUDA_PKG_VERSION \ - cuda-npp-dev-$CUDA_PKG_VERSION \ - cuda-libraries-dev-$CUDA_PKG_VERSION \ - cuda-minimal-build-$CUDA_PKG_VERSION \ - libcublas-dev=10.2.1.243-1 \ - libnccl-dev=2.7.8-1+cuda10.1 && \ - apt-mark hold libnccl-dev && \ - rm -rf /var/lib/apt/lists/* - -# apt from auto upgrading the cublas package. 
See https://gitlab.com/nvidia/container-images/cuda/-/issues/88 -RUN apt-mark hold libcublas-dev - - -ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs - -## FROM CUDA 10.1-CUDNN 7 devel - -ENV CUDNN_VERSION 7.6.0.64 -LABEL com.nvidia.cudnn.version="${CUDNN_VERSION}" - -RUN apt-get update && apt-get install -y --no-install-recommends \ - libcudnn7=$CUDNN_VERSION-1+cuda10.1 \ - libcudnn7-dev=$CUDNN_VERSION-1+cuda10.1 && \ - apt-mark hold libcudnn7 && \ - rm -rf /var/lib/apt/lists/* - -WORKDIR / \ No newline at end of file diff --git a/docker/prebuilt/devel/gpu/8.0/Dockerfile b/docker/prebuilt/devel/gpu/8.0/Dockerfile deleted file mode 100644 index 296b3286eea..00000000000 --- a/docker/prebuilt/devel/gpu/8.0/Dockerfile +++ /dev/null @@ -1,80 +0,0 @@ -ARG FROM_TAG -FROM espnet/espnet:${FROM_TAG} -LABEL maintainer "Nelson Yalta " - -## FROM CUDA 8.0 runtime - -RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates apt-transport-https gnupg-curl && \ - rm -rf /var/lib/apt/lists/* && \ - NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \ - NVIDIA_GPGKEY_FPR=ae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80 && \ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub && \ - apt-key adv --export --no-emit-version -a $NVIDIA_GPGKEY_FPR | tail -n +5 > cudasign.pub && \ - echo "$NVIDIA_GPGKEY_SUM cudasign.pub" | sha256sum -c --strict - && rm cudasign.pub && \ - echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list - -ENV CUDA_VERSION 8.0.61 - -ENV CUDA_PKG_VERSION 8-0=$CUDA_VERSION-1 -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-nvrtc-$CUDA_PKG_VERSION \ - cuda-nvgraph-$CUDA_PKG_VERSION \ - cuda-cusolver-$CUDA_PKG_VERSION \ - cuda-cublas-8-0=8.0.61.2-1 \ - cuda-cufft-$CUDA_PKG_VERSION \ - cuda-curand-$CUDA_PKG_VERSION \ - cuda-cusparse-$CUDA_PKG_VERSION \ - 
cuda-npp-$CUDA_PKG_VERSION \ - cuda-cudart-$CUDA_PKG_VERSION && \ - ln -s cuda-8.0 /usr/local/cuda && \ - rm -rf /var/lib/apt/lists/* - -# nvidia-docker 1.0 -LABEL com.nvidia.volumes.needed="nvidia_driver" -LABEL com.nvidia.cuda.version="${CUDA_VERSION}" - -RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ - echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf - -ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} -ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64 -ENV CUDA_HOME /usr/local/cuda - -# nvidia-container-runtime -ENV NVIDIA_VISIBLE_DEVICES all -ENV NVIDIA_DRIVER_CAPABILITIES compute,utility -ENV NVIDIA_REQUIRE_CUDA "cuda>=8.0" - -## FROM CUDA 8.0 devel - -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-core-$CUDA_PKG_VERSION \ - cuda-misc-headers-$CUDA_PKG_VERSION \ - cuda-command-line-tools-$CUDA_PKG_VERSION \ - cuda-nvrtc-dev-$CUDA_PKG_VERSION \ - cuda-nvml-dev-$CUDA_PKG_VERSION \ - cuda-nvgraph-dev-$CUDA_PKG_VERSION \ - cuda-cusolver-dev-$CUDA_PKG_VERSION \ - cuda-cublas-dev-8-0=8.0.61.2-1 \ - cuda-cufft-dev-$CUDA_PKG_VERSION \ - cuda-curand-dev-$CUDA_PKG_VERSION \ - cuda-cusparse-dev-$CUDA_PKG_VERSION \ - cuda-npp-dev-$CUDA_PKG_VERSION \ - cuda-cudart-dev-$CUDA_PKG_VERSION \ - cuda-driver-dev-$CUDA_PKG_VERSION && \ - rm -rf /var/lib/apt/lists/* - -ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs - -## FROM CUDA 8.0 CUDNN 7 devel - -RUN echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list - -ENV CUDNN_VERSION 7.2.1.38 -LABEL com.nvidia.cudnn.version="${CUDNN_VERSION}" - -RUN apt-get update && apt-get install -y --no-install-recommends \ - libcudnn7=$CUDNN_VERSION-1+cuda8.0 \ - libcudnn7-dev=$CUDNN_VERSION-1+cuda8.0 && \ - apt-mark hold libcudnn7 && \ - rm -rf /var/lib/apt/lists/* diff --git a/docker/prebuilt/devel/gpu/9.0/Dockerfile b/docker/prebuilt/devel/gpu/9.0/Dockerfile deleted file 
mode 100644 index 7bd144354fd..00000000000 --- a/docker/prebuilt/devel/gpu/9.0/Dockerfile +++ /dev/null @@ -1,76 +0,0 @@ -ARG FROM_TAG -FROM espnet/espnet:${FROM_TAG} -LABEL maintainer "Nelson Yalta " - -## FROM CUDA 9.0 base - -RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates apt-transport-https gnupg-curl && \ - rm -rf /var/lib/apt/lists/* && \ - NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \ - NVIDIA_GPGKEY_FPR=ae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80 && \ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub && \ - apt-key adv --export --no-emit-version -a $NVIDIA_GPGKEY_FPR | tail -n +5 > cudasign.pub && \ - echo "$NVIDIA_GPGKEY_SUM cudasign.pub" | sha256sum -c --strict - && rm cudasign.pub && \ - echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \ - echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list - -ENV CUDA_VERSION 9.0.176 - -ENV CUDA_PKG_VERSION 9-0=$CUDA_VERSION-1 -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-cudart-$CUDA_PKG_VERSION && \ - ln -s cuda-9.0 /usr/local/cuda && \ - rm -rf /var/lib/apt/lists/* - -# nvidia-docker 1.0 -LABEL com.nvidia.volumes.needed="nvidia_driver" -LABEL com.nvidia.cuda.version="${CUDA_VERSION}" - -RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ - echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf - -ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} -ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64 -ENV CUDA_HOME /usr/local/cuda - -# nvidia-container-runtime -ENV NVIDIA_VISIBLE_DEVICES all -ENV NVIDIA_DRIVER_CAPABILITIES compute,utility -ENV NVIDIA_REQUIRE_CUDA "cuda>=9.0" - -## FROM CUDA 9.0 runtime - -ENV NCCL_VERSION 2.4.2 - -RUN apt-get 
update && apt-get install -y --no-install-recommends \ - cuda-libraries-$CUDA_PKG_VERSION \ - cuda-cublas-9-0=9.0.176.4-1 \ - libnccl2=$NCCL_VERSION-1+cuda9.0 && \ - apt-mark hold libnccl2 && \ - rm -rf /var/lib/apt/lists/* - -## FROM CUDA 9.0 devel - -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-libraries-dev-$CUDA_PKG_VERSION \ - cuda-nvml-dev-$CUDA_PKG_VERSION \ - cuda-minimal-build-$CUDA_PKG_VERSION \ - cuda-command-line-tools-$CUDA_PKG_VERSION \ - cuda-core-9-0=9.0.176.3-1 \ - cuda-cublas-dev-9-0=9.0.176.4-1 \ - libnccl-dev=$NCCL_VERSION-1+cuda9.0 && \ - rm -rf /var/lib/apt/lists/* - -ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs - -## FROM CUDA 9.0 CUDNN 7 devel - -ENV CUDNN_VERSION 7.4.2.24 -LABEL com.nvidia.cudnn.version="${CUDNN_VERSION}" - -RUN apt-get update && apt-get install -y --no-install-recommends \ - libcudnn7=$CUDNN_VERSION-1+cuda9.0 \ - libcudnn7-dev=$CUDNN_VERSION-1+cuda9.0 && \ - apt-mark hold libcudnn7 && \ - rm -rf /var/lib/apt/lists/* - diff --git a/docker/prebuilt/devel/gpu/9.1/Dockerfile b/docker/prebuilt/devel/gpu/9.1/Dockerfile deleted file mode 100644 index 5f0b2c62e60..00000000000 --- a/docker/prebuilt/devel/gpu/9.1/Dockerfile +++ /dev/null @@ -1,72 +0,0 @@ -ARG FROM_TAG -FROM espnet/espnet:${FROM_TAG} -LABEL maintainer "Nelson Yalta " - -## FROM CUDA 9.1 base - -RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates apt-transport-https gnupg-curl && \ - rm -rf /var/lib/apt/lists/* && \ - NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \ - NVIDIA_GPGKEY_FPR=ae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80 && \ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub && \ - apt-key adv --export --no-emit-version -a $NVIDIA_GPGKEY_FPR | tail -n +5 > cudasign.pub && \ - echo "$NVIDIA_GPGKEY_SUM cudasign.pub" | sha256sum -c --strict - && rm cudasign.pub && \ - echo "deb 
https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \ - echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list - -ENV CUDA_VERSION 9.1.85 - -ENV CUDA_PKG_VERSION 9-1=$CUDA_VERSION-1 -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-cudart-$CUDA_PKG_VERSION && \ - ln -s cuda-9.1 /usr/local/cuda && \ - rm -rf /var/lib/apt/lists/* - -# nvidia-docker 1.0 -LABEL com.nvidia.volumes.needed="nvidia_driver" -LABEL com.nvidia.cuda.version="${CUDA_VERSION}" - -RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ - echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf - -ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} -ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64 -ENV CUDA_HOME /usr/local/cuda - -# nvidia-container-runtime -ENV NVIDIA_VISIBLE_DEVICES all -ENV NVIDIA_DRIVER_CAPABILITIES compute,utility -ENV NVIDIA_REQUIRE_CUDA "cuda>=9.1" - -## FROM CUDA 9.1 runtime - -ENV NCCL_VERSION 2.2.12 - -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-libraries-$CUDA_PKG_VERSION \ - libnccl2=$NCCL_VERSION-1+cuda9.1 && \ - apt-mark hold libnccl2 && \ - rm -rf /var/lib/apt/lists/* - -## FROM CUDA 9.1 devel - -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-libraries-dev-$CUDA_PKG_VERSION \ - cuda-nvml-dev-$CUDA_PKG_VERSION \ - cuda-minimal-build-$CUDA_PKG_VERSION \ - cuda-command-line-tools-$CUDA_PKG_VERSION \ - libnccl-dev=$NCCL_VERSION-1+cuda9.1 && \ - rm -rf /var/lib/apt/lists/* - -ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs - -## FROM CUDA 9.1 CUDNN 7 - -ENV CUDNN_VERSION 7.1.2.21 -LABEL com.nvidia.cudnn.version="${CUDNN_VERSION}" - -RUN apt-get update && apt-get install -y --no-install-recommends \ - libcudnn7=$CUDNN_VERSION-1+cuda9.1 \ - libcudnn7-dev=$CUDNN_VERSION-1+cuda9.1 && \ - apt-mark hold 
libcudnn7 && \ - rm -rf /var/lib/apt/lists/* diff --git a/docker/prebuilt/devel/gpu/9.2/Dockerfile b/docker/prebuilt/devel/gpu/9.2/Dockerfile deleted file mode 100644 index 14a089b6d34..00000000000 --- a/docker/prebuilt/devel/gpu/9.2/Dockerfile +++ /dev/null @@ -1,73 +0,0 @@ -ARG FROM_TAG -FROM espnet/espnet:${FROM_TAG} -LABEL maintainer "Nelson Yalta " - -## FROM CUDA 9.2 base [https://gitlab.com/nvidia/cuda/blob/ubuntu18.04/9.2/base/Dockerfile] -# CUDA 9.2 is not officially supported on ubuntu 18.04 yet, the ubuntu 17.10 repository for CUDA were used instead. -RUN apt-get update && apt-get install -y --no-install-recommends gnupg2 curl ca-certificates && \ - curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1710/x86_64/7fa2af80.pub | apt-key add - && \ - echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1710/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \ - echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list && \ - apt-get purge --autoremove -y curl && \ - rm -rf /var/lib/apt/lists/* - -ENV CUDA_VERSION 9.2.148 - -ENV CUDA_PKG_VERSION 9-2=$CUDA_VERSION-1 -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-cudart-$CUDA_PKG_VERSION && \ - ln -s cuda-9.2 /usr/local/cuda && \ - rm -rf /var/lib/apt/lists/* - -# nvidia-docker 1.0 -LABEL com.nvidia.volumes.needed="nvidia_driver" -LABEL com.nvidia.cuda.version="${CUDA_VERSION}" - -RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ - echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf - -ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} -ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64 - -# nvidia-container-runtime -ENV NVIDIA_VISIBLE_DEVICES all -ENV NVIDIA_DRIVER_CAPABILITIES compute,utility -ENV NVIDIA_REQUIRE_CUDA "cuda>=9.2" - -ENV CUDA_HOME /usr/local/cuda - -## FROM CUDA 9.2 runtime - -ENV 
NCCL_VERSION 2.3.7 - -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-libraries-$CUDA_PKG_VERSION \ - cuda-nvtx-$CUDA_PKG_VERSION \ - libnccl2=$NCCL_VERSION-1+cuda9.2 && \ - apt-mark hold libnccl2 && \ - rm -rf /var/lib/apt/lists/* - -## FROM CUDA 9.2 devel [https://gitlab.com/nvidia/cuda/blob/ubuntu18.04/9.2/devel/Dockerfile] - -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-libraries-dev-$CUDA_PKG_VERSION \ - cuda-nvml-dev-$CUDA_PKG_VERSION \ - cuda-minimal-build-$CUDA_PKG_VERSION \ - cuda-command-line-tools-$CUDA_PKG_VERSION \ - libnccl-dev=$NCCL_VERSION-1+cuda9.2 && \ - rm -rf /var/lib/apt/lists/* - -ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs - -## FROM CUDA 9.2-CUDNN 7 devel [https://gitlab.com/nvidia/cuda/blob/ubuntu18.04/9.2/devel/cudnn7/Dockerfile] - -ENV CUDNN_VERSION 7.5.0.56 -LABEL com.nvidia.cudnn.version="${CUDNN_VERSION}" - -RUN apt-get update && apt-get install -y --no-install-recommends \ - libcudnn7=$CUDNN_VERSION-1+cuda9.2 \ - libcudnn7-dev=$CUDNN_VERSION-1+cuda9.2 && \ - apt-mark hold libcudnn7 && \ - rm -rf /var/lib/apt/lists/* - -WORKDIR / \ No newline at end of file diff --git a/docker/prebuilt/devel/gpu/11.1/Dockerfile b/docker/prebuilt/gpu.dockerfile similarity index 96% rename from docker/prebuilt/devel/gpu/11.1/Dockerfile rename to docker/prebuilt/gpu.dockerfile index d49660f50a6..a94504dc52c 100644 --- a/docker/prebuilt/devel/gpu/11.1/Dockerfile +++ b/docker/prebuilt/gpu.dockerfile @@ -1,5 +1,7 @@ ARG FROM_TAG -FROM espnet/espnet:${FROM_TAG} +ARG NUM_BUILD_CORES=8 +ARG DOCKER_VER +FROM espnet/espnet:${FROM_TAG} AS cuda_builder LABEL maintainer "Nelson Yalta " ## FROM CUDA 11.1 base @@ -55,5 +57,4 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ RUN apt-mark hold libcublas-dev-11-1 ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs - WORKDIR / diff --git a/docker/prebuilt/local/Dockerfile b/docker/prebuilt/local/Dockerfile deleted file mode 100644 index 
15939185aff..00000000000 --- a/docker/prebuilt/local/Dockerfile +++ /dev/null @@ -1,50 +0,0 @@ -ARG FROM_TAG -FROM espnet/espnet:${FROM_TAG} -LABEL maintainer "Nelson Yalta " - -ARG CUDA_VER -WORKDIR / - -# IF using a local ESPNet repository, a temporary file containing the ESPnet git repo is copied over -ARG ESPNET_ARCHIVE=./espnet-local.tar -COPY ${ESPNET_ARCHIVE} /espnet-local.tar - - -# Download ESPnet -RUN echo "Getting ESPnet sources from local repository, in temporary file: " ${ESPNET_ARCHIVE} -RUN mkdir /espnet -RUN tar xf espnet-local.tar -C /espnet/ -RUN rm espnet-local.tar - -RUN cd espnet && \ - rm -rf docker egs test utils - -# Install espnet -WORKDIR /espnet/tools - -# Replace nvidia-smi for nvcc because docker does not load nvidia-smi -RUN if [ -z "$( which nvcc )" ]; then \ - echo "Build without CUDA" && \ - MY_OPTS='CUPY_VERSION="" TH_VERSION=1.6.0'; \ - else \ - echo "Build with CUDA" && \ - # Disable cupy test - # Docker build does not load libcuda.so.1 - # So, their checks on cuda packages are disabled. - sed -i '200s|install.py|install.py --no-cuda --no-cupy |' Makefile && \ - export CFLAGS="-I${CUDA_HOME}/include ${CFLAGS}" && \ - MY_OPTS="CUDA_VERSION=${CUDA_VER}" && \ - . 
./setup_cuda_env.sh /usr/local/cuda; \ - fi; \ - if [ "${CUDA_VER}" = "10.1" ]; then \ - # warpctc is not supported from Pytorch 1.3.1 - MY_OPTS="${MY_OPTS} TH_VERSION=1.6.0"; \ - fi; \ - echo "Make with options ${MY_OPTS}" && \ - ln -s /kaldi ./ && \ - ./setup_anaconda.sh /miniconda espnet 3.7.4 && \ - make KALDI=/kaldi ${MY_OPTS} - -RUN rm -rf ../espnet - -WORKDIR / diff --git a/docker/prebuilt/runtime/Dockerfile b/docker/prebuilt/runtime.dockerfile similarity index 82% rename from docker/prebuilt/runtime/Dockerfile rename to docker/prebuilt/runtime.dockerfile index 86ac859a67a..5f54ed90c90 100644 --- a/docker/prebuilt/runtime/Dockerfile +++ b/docker/prebuilt/runtime.dockerfile @@ -1,11 +1,11 @@ ARG FROM_TAG -FROM ubuntu:${FROM_TAG} +ARG NUM_BUILD_CORES=8 +ARG DOCKER_VER + +FROM ubuntu:${FROM_TAG} AS main_builder LABEL maintainer "Nelson Yalta " -ARG DOCKER_VER ENV DOCKER_BUILT_VER ${DOCKER_VER} - -ARG NUM_BUILD_CORES=8 ENV NUM_BUILD_CORES ${NUM_BUILD_CORES} RUN apt-get update && DEBIAN_FRONTEND=noninteractive \ @@ -49,19 +49,24 @@ RUN add-apt-repository ppa:git-core/ppa -y && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -# # Using kaldi pre-built binaries RUN git clone --depth 1 https://github.com/kaldi-asr/kaldi /opt/kaldi +RUN wget --tries=3 -nv "https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh" -O miniconda.sh && \ + bash miniconda.sh -b -p /opt/miniconda && \ + rm miniconda.sh + +WORKDIR / + +FROM main_builder AS espnet1 +# # Using kaldi pre-built binaries RUN cd /opt/kaldi/tools && \ echo "" > extras/check_dependencies.sh && \ chmod +x extras/check_dependencies.sh && \ cd /opt/kaldi && \ - wget --tries=3 https://github.com/espnet/kaldi-bin/releases/download/v0.0.1/ubuntu16-featbin.tar.gz && \ + wget --tries=3 -nv https://github.com/espnet/kaldi-bin/releases/download/v0.0.1/ubuntu16-featbin.tar.gz && \ tar -xf ./ubuntu16-featbin.tar.gz && \ cp featbin/* src/featbin/ && \ rm -rf featbin && \ rm -f ubuntu16-featbin.tar.gz -RUN wget 
--tries=3 "https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh" -O miniconda.sh && \ - bash miniconda.sh -b -p /opt/miniconda && \ - rm miniconda.sh +WORKDIR / diff --git a/docker/run.sh b/docker/run.sh index b0fbcebd307..cff0d5604bc 100755 --- a/docker/run.sh +++ b/docker/run.sh @@ -116,8 +116,8 @@ if [ ${is_root} = false ]; then build_args="${build_args} --build-arg THIS_UID=${UID}" build_args="${build_args} --build-arg EXTRA_LIBS=${EXTRAS}" - echo "Now running docker build ${build_args} -f prebuilt/Dockerfile -t espnet/espnet:${container_tag} ." - (docker build ${build_args} -f prebuilt/Dockerfile -t espnet/espnet:${container_tag} .) || exit 1 + echo "Now running docker build ${build_args} -f espnet.dockerfile -t espnet/espnet:${container_tag} ." + (docker build ${build_args} -f espnet.dockerfile -t espnet/espnet:${container_tag} .) || exit 1 fi else container_tag=${from_tag} diff --git a/egs/README.md b/egs/README.md index 9cddf37df4e..78fa57049ae 100755 --- a/egs/README.md +++ b/egs/README.md @@ -8,6 +8,7 @@ See: https://espnet.github.io/espnet/tutorial.html | Directory name | Corpus name | Task | Language | URL | Note | | ----------------------- | ------------------------------------------------------------ | ------------------------------------------ | -------------- | ------------------------------------------------------------ | ----------------------------- | |||| +| aesrc2020 | Accented English Speech Recognition Challenge 2020 | ASR | EN | https://arxiv.org/abs/2102.10233 | | | aidatatang_200zh | Aidatatang_200zh A free Chinese Mandarin speech corpus | ASR | ZH | http://www.openslr.org/62/ | | | aishell | AISHELL-ASR0009-OS1 Open Source Mandarin Speech Corpus | ASR | ZH | http://www.aishelltech.com/kysjcp | | | aishell2 | AISHELL-2 Open Source Mandarin Speech Corpus | ASR | ZH | http://www.aishelltech.com/aishell_2 | @@ -49,6 +50,8 @@ See: https://espnet.github.io/espnet/tutorial.html | librispeech | LibriSpeech ASR corpus | ASR | EN | 
http://www.openslr.org/12 | | | libritts | LibriTTS: A Corpus Derived from LibriSpeech for Text-to-Speech | TTS | EN | http://www.openslr.org/60/ | | | ljspeech | The LJ Speech Dataset | TTS | EN | https://keithito.com/LJ-Speech-Dataset/ | | +| lrs2 | The Lip Reading Sentences 2 Dataset | ASR | ENG | https://www.robots.ox.ac.uk/~vgg/data/lip_reading/lrs2.html | | +| lrs | The Lip Reading Sentences 2 and 3 Dataset | AVSR | ENG | https://www.robots.ox.ac.uk/~vgg/data/lip_reading/lrs2.html https://www.robots.ox.ac.uk/~vgg/data/lip_reading/lrs3.html | | | m_ailabs | The M-AILABS Speech Dataset | TTS | ~5 languages | https://www.caito.de/2019/01/the-m-ailabs-speech-dataset/ | | mucs_2021 | MUCS 2021: MUltilingual and Code-Switching ASR Challenges for Low Resource Indian Languages | ASR/Code Switching | HI, MR, OR, TA, TE, GU, HI-EN, BN-EN | https://navana-tech.github.io/MUCS2021/data.html | | | mtedx | Multilingual TEDx | ASR/Machine Translation/Speech Translation | 13 Language pairs | http://www.openslr.org/100/ | diff --git a/egs/aesrc2020/asr1/RESULTS.md b/egs/aesrc2020/asr1/RESULTS.md new file mode 100644 index 00000000000..e69de29bb2d diff --git a/egs/aesrc2020/asr1/cmd.sh b/egs/aesrc2020/asr1/cmd.sh new file mode 100644 index 00000000000..7b70ef5e06e --- /dev/null +++ b/egs/aesrc2020/asr1/cmd.sh @@ -0,0 +1,89 @@ +# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ====== +# Usage: .pl [options] JOB=1: +# e.g. +# run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB +# +# Options: +# --time