diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml new file mode 100644 index 00000000000..049bdabafa1 --- /dev/null +++ b/.github/workflows/docker.yml @@ -0,0 +1,48 @@ +name: docker-builder + +on: + pull_request: + types: [closed] + branches: + - master + paths: + - 'tools/**' + - setup.py + +jobs: + docker: + runs-on: ubuntu-latest + if: github.event.pull_request.merged == true + steps: + - uses: actions/checkout@v2 + + - name: Set up QEMU + uses: docker/setup-qemu-action@v1 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v1 + + - name: Login to DockerHub + uses: docker/login-action@v1 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Build and push CPU container + run: | + cd docker + docker build --build-arg FROM_TAG=runtime-latest \ + -f prebuilt/devel.dockerfile \ + --target devel \ + -t espnet/espnet:cpu-latest . + docker push espnet/espnet:cpu-latest + + - name: Build and push GPU container + run: | + cd docker + docker build --build-arg FROM_TAG=cuda-latest \ + --build-arg CUDA_VER=11.1 \ + -f prebuilt/devel.dockerfile \ + --target devel \ + -t espnet/espnet:gpu-latest . + docker push espnet/espnet:gpu-latest diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 979e7397012..9036a09b66d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -151,6 +151,11 @@ we recommend using small model parameters and avoiding dynamic imports, file acc more running time, you can annotate your test with `@pytest.mark.execution_timeout(sec)`. - For test initialization (parameters, modules, etc), you can use pytest fixtures. Refer to [pytest fixtures](https://docs.pytest.org/en/latest/fixture.html#using-fixtures-from-classes-modules-or-projects) for more information. +In addition, please follow the [PEP 8 convention](https://peps.python.org/pep-0008/) for the coding style and [Google's convention for docstrings](https://google.github.io/styleguide/pyguide.html#383-functions-and-methods). +Below are some specific points that require particular care: +- [import ordering](https://peps.python.org/pep-0008/#imports) +- Avoid writing Python 2-style code. For example, `super().__init__()` is preferred over `super(CLASS_NAME, self).__init__()`. + ### 4.2 Bash scripts diff --git a/README.md b/README.md index 082e5450f78..678c52103f5 100644 --- a/README.md +++ b/README.md @@ -77,12 +77,12 @@ ESPnet uses [pytorch](http://pytorch.org/) as a deep learning engine and also fo - Self-supervised learning representations as features, using upstream models in [S3PRL](https://github.com/s3prl/s3prl) in frontend. - Set `frontend` to be `s3prl` - Select any upstream model by setting the `frontend_conf` to the corresponding name. +- Transfer Learning: + - Easy usage and transfer from models previously trained by your group, or from the [ESPnet Hugging Face repository](https://huggingface.co/espnet). + - [Documentation](https://github.com/espnet/espnet/tree/master/egs2/mini_an4/asr1/transfer_learning.md) and [toy example runnable on colab](https://github.com/espnet/notebook/blob/master/espnet2_asr_transfer_learning_demo.ipynb). - Streaming Transformer/Conformer ASR with blockwise synchronous beam search.
- Restricted Self-Attention based on [Longformer](https://arxiv.org/abs/2004.05150) as an encoder for long sequences -### SUM: Speech Summarization -- End to End Speech Summarization Recipe for Instructional Videos using Restricted Self-Attention [[Sharma et al., 2022]](https://arxiv.org/abs/2110.06263) - Demonstration - Real-time ASR demo with ESPnet2 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/espnet/notebook/blob/master/espnet2_asr_realtime_demo.ipynb) - [Gradio](https://github.com/gradio-app/gradio) Web Demo on [Huggingface Spaces](https://huggingface.co/docs/hub/spaces). Check out the [Web Demo](https://huggingface.co/spaces/akhaliq/espnet2_asr) @@ -133,7 +133,7 @@ To train the neural vocoder, please check the following repositories: - Multi-speaker speech separation - Unified encoder-separator-decoder structure for time-domain and frequency-domain models - Encoder/Decoder: STFT/iSTFT, Convolution/Transposed-Convolution - - Separators: BLSTM, Transformer, Conformer, [TasNet](https://arxiv.org/abs/1809.07454), [DPRNN](https://arxiv.org/abs/1910.06379), [DC-CRN](https://web.cse.ohio-state.edu/~wang.77/papers/TZW.taslp21.pdf), [DCCRN](https://arxiv.org/abs/2008.00264), Neural Beamformers, etc. + - Separators: BLSTM, Transformer, Conformer, [TasNet](https://arxiv.org/abs/1809.07454), [DPRNN](https://arxiv.org/abs/1910.06379), [SkiM](https://arxiv.org/abs/2201.10800), [SVoice](https://arxiv.org/abs/2011.02329), [DC-CRN](https://web.cse.ohio-state.edu/~wang.77/papers/TZW.taslp21.pdf), [DCCRN](https://arxiv.org/abs/2008.00264), [Deep Clustering](https://ieeexplore.ieee.org/document/7471631), [Deep Attractor Network](https://pubmed.ncbi.nlm.nih.gov/29430212/), [FaSNet](https://arxiv.org/abs/1909.13387), [iFaSNet](https://arxiv.org/abs/1910.14104), Neural Beamformers, etc. - Flexible ASR integration: working as an individual task or as the ASR frontend - Easy to import pretrained models from [Asteroid](https://github.com/asteroid-team/asteroid) - Both the pre-trained models from Asteroid and the specific configuration are supported. @@ -141,7 +141,6 @@ To train the neural vocoder, please check the following repositories: Demonstration - Interactive SE demo with ESPnet2 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1fjRJCh96SoYLZPRxsjF9VDv4Q2VoIckI?usp=sharing) - ### ST: Speech Translation & MT: Machine Translation - **State-of-the-art performance** in several ST benchmarks (comparable/superior to cascaded ASR and MT) - Transformer based end-to-end ST (new!) @@ -152,9 +151,34 @@ Demonstration - End-to-end VC based on cascaded ASR+TTS (Baseline system for Voice Conversion Challenge 2020!) ### SLU: Speech Language Understanding -- Predicting intent by directly classifying it as one of intent or decoding by character -- Transformer & RNN based encoder-decoder model -- Establish SOTA results with spectral augmentation (Performs better than reported results of pretrained model on Fluent Speech Command Dataset) +- Architecture + - Transformer based Encoder + - Conformer based Encoder + - RNN based Decoder + - Transformer based Decoder +- Support Multitasking with ASR + - Predict both intent and ASR transcript +- Support Multitasking with NLU + - Deliberation encoder based 2 pass model +- Support using pretrained ASR models + - Hubert + - Wav2vec2 + - VQ-APC + - TERA and more ... +- Support using pretrained NLP models + - BERT + - MPNet And more... 
+- Various language support + - En / Jp / Zn / Nl / and more... +- Supports using context from previous utterances +- Supports using other tasks, such as SE, in a pipeline manner +Demonstration +- Performing noisy spoken language understanding using a speech enhancement model followed by a spoken language understanding model. [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/14nCrJ05vJcQX0cJuXjbMVFWUHJ3Wfb6N?usp=sharing) +- Integrated into [Huggingface Spaces](https://huggingface.co/spaces) with [Gradio](https://github.com/gradio-app/gradio). See the SLU demo on multiple languages: [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/Siddhant/ESPnet2-SLU) + + +### SUM: Speech Summarization +- End to End Speech Summarization Recipe for Instructional Videos using Restricted Self-Attention [[Sharma et al., 2022]](https://arxiv.org/abs/2110.06263) ### DNN Framework - Flexible network architecture thanks to chainer and pytorch @@ -532,11 +556,33 @@ You can download converted samples of the cascade ASR+TTS baseline system [here] ### SLU results -<details><summary>ESPnet2</summary><div> +<details><summary>expand</summary><div>
+ + +We list the performance on various SLU tasks and dataset using the metric reported in the original dataset paper + +| Task | Dataset | Metric | Result | Pretrained Model | +| ----------------------------------------------------------------- | :-------------: | :-------------: | :-------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Intent Classification | SLURP | Acc | 86.3 | [link](https://github.com/espnet/espnet/tree/master/egs2/slurp/asr1/README.md) | +| Intent Classification | FSC | Acc | 99.6 | [link](https://github.com/espnet/espnet/tree/master/egs2/fsc/asr1/README.md) | +| Intent Classification | FSC Unseen Speaker Set | Acc | 98.6 | [link](https://github.com/espnet/espnet/tree/master/egs2/fsc_unseen/asr1/README.md) | +| Intent Classification | FSC Unseen Utterance Set | Acc | 86.4 | [link](https://github.com/espnet/espnet/tree/master/egs2/fsc_unseen/asr1/README.md) | +| Intent Classification | FSC Challenge Speaker Set | Acc | 97.5 | [link](https://github.com/espnet/espnet/tree/master/egs2/fsc_challenge/asr1/README.md) | +| Intent Classification | FSC Challenge Utterance Set | Acc | 78.5 | [link](https://github.com/espnet/espnet/tree/master/egs2/fsc_challenge/asr1/README.md) | +| Intent Classification | SNIPS | F1 | 91.7 | [link](https://github.com/espnet/espnet/tree/master/egs2/snips/asr1/README.md) | +| Intent Classification | Grabo (Nl) | Acc | 97.2 | [link](https://github.com/espnet/espnet/tree/master/egs2/grabo/asr1/README.md) | +| Intent Classification | CAT SLU MAP (Zn) | Acc | 78.9 | [link](https://github.com/espnet/espnet/tree/master/egs2/catslu/asr1/README.md) | +| Intent Classification | Google Speech Commands | Acc | 98.4 | [link](https://github.com/espnet/espnet/tree/master/egs2/speechcommands/asr1/README.md) | +| Slot Filling | SLURP | SLU-F1 | 71.9 | [link](https://github.com/espnet/espnet/tree/master/egs2/slurp_entity/asr1/README.md) | +| Dialogue Act Classification | Switchboard | Acc | 67.5 | [link](https://github.com/espnet/espnet/tree/master/egs2/swbd_da/asr1/README.md) | +| Dialogue Act Classification | Jdcinal (Jp) | Acc | 67.4 | [link](https://github.com/espnet/espnet/tree/master/egs2/jdcinal/asr1/README.md) | +| Emotion Recognition | IEMOCAP | Acc | 69.4 | [link](https://github.com/espnet/espnet/tree/master/egs2/iemocap/asr1/README.md) | +| Emotion Recognition | swbd_sentiment | Macro F1 | 61.4 | [link](https://github.com/espnet/espnet/tree/master/egs2/swbd_sentiment/asr1/README.md) | +| Emotion Recognition | slue_voxceleb | Macro F1 | 44.0 | [link](https://github.com/espnet/espnet/tree/master/egs2/slue-voxceleb/asr1/README.md) | -- Transformer based SLU for Fluent Speech Command Dataset + +If you want to check the results of the other recipes, please check `egs2//asr1/RESULTS.md`. -In SLU, The objective is to infer the meaning or intent of spoken utterance. The [Fluent Speech Command Dataset](https://fluent.ai/fluent-speech-commands-a-dataset-for-spoken-language-understanding-research/) describes an intent as combination of 3 slot values: action, object and location. You can see baseline results on this dataset [here](https://github.com/espnet/espnet/blob/master/egs2/fsc/asr1/RESULTS.md)
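The per-task links in the table above point at standard `egs2` recipes, so each number can in principle be reproduced with the usual recipe workflow. The following is only a rough sketch under that assumption: the SLURP recipe path comes from the table, while the stage range follows the generic egs2 ASR template and should be checked against the recipe's own `run.sh` and README before running.

```bash
# Rough sketch (not part of this patch): reproduce the SLURP intent-classification
# entry from the table above with the standard egs2 recipe layout.
# Stage numbers follow the generic egs2 ASR template; consult
# egs2/slurp/asr1/run.sh for the recipe-specific options.
cd egs2/slurp/asr1
./run.sh --ngpu 1 --stage 1 --stop-stage 13
```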
@@ -689,6 +735,8 @@ See the module documentation for more information. It is recommended to use models with RNN-based encoders (such as BLSTMP) for aligning large audio files, rather than Transformer models, which have high memory consumption on longer audio data. The sample rate of the audio must be consistent with that of the data used in training; adjust with `sox` if needed. + +Also, this tool can provide token-level segmentation information if we prepare a list of tokens instead of a list of utterances in the `text` file. See the discussion in https://github.com/espnet/espnet/issues/4278#issuecomment-1100756463.
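To make the utterance-level vs. token-level distinction concrete, here is a hypothetical sketch of the two `text` file variants. It assumes the usual Kaldi-style `<segment-id> <content>` layout used by the alignment tool; the IDs and tokens below are made up for illustration and are not taken from the linked issue.

```bash
# Utterance-level alignment (the default described above):
# one transcription per segment id in the `text` file.
cat > text <<'EOF'
utt_0001 THE SALE OF THE HOTELS
utt_0002 IS PART OF HOLIDAYS STRATEGY
EOF

# Token-level alignment (the variant discussed in the linked issue):
# give every token its own entry instead, so the aligner returns one segment
# per token. The id scheme here is only an example; use whichever variant
# fits your use case (both write to the same `text` file).
cat > text <<'EOF'
utt_0001_0001 THE
utt_0001_0002 SALE
utt_0001_0003 OF
utt_0001_0004 THE
utt_0001_0005 HOTELS
EOF
```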
diff --git a/ci/test_integration_espnet2.sh b/ci/test_integration_espnet2.sh index 78086272af7..58951c04011 100755 --- a/ci/test_integration_espnet2.sh +++ b/ci/test_integration_espnet2.sh @@ -100,6 +100,50 @@ if python3 -c "import fairseq" &> /dev/null; then cd "${cwd}" fi +# [ESPnet2] test enh_asr1 recipe +if python -c 'import torch as t; from distutils.version import LooseVersion as L; assert L(t.__version__) >= L("1.2.0")' &> /dev/null; then + cd ./egs2/mini_an4/enh_asr1 + echo "==== [ESPnet2] ENH_ASR ===" + ./run.sh --ngpu 0 --stage 0 --stop-stage 15 --skip-upload_hf false --feats-type "raw" --spk-num 1 --enh_asr_args "--max_epoch=1 --enh_separator_conf num_spk=1" --python "${python}" + # Remove generated files in order to reduce the disk usage + rm -rf exp dump data + cd "${cwd}" +fi + +# [ESPnet2] test st recipe +cd ./egs2/mini_an4/st1 +echo "==== [ESPnet2] ST ===" +./run.sh --stage 1 --stop-stage 1 +feats_types="raw fbank_pitch" +token_types="bpe char" +for t in ${feats_types}; do + ./run.sh --stage 2 --stop-stage 4 --feats-type "${t}" --python "${python}" +done +for t in ${token_types}; do + ./run.sh --stage 5 --stop-stage 5 --tgt_token_type "${t}" --src_token_type "${t}" --python "${python}" +done +for t in ${feats_types}; do + for t2 in ${token_types}; do + echo "==== feats_type=${t}, token_types=${t2} ===" + ./run.sh --ngpu 0 --stage 6 --stop-stage 13 --skip-upload false --feats-type "${t}" --tgt_token_type "${t2}" --src_token_type "${t2}" \ + --st-args "--max_epoch=1" --lm-args "--max_epoch=1" --inference_args "--beam_size 5" --python "${python}" + done +done +echo "==== feats_type=raw, token_types=bpe, model_conf.extract_feats_in_collect_stats=False, normalize=utt_mvn ===" +./run.sh --ngpu 0 --stage 10 --stop-stage 13 --skip-upload false --feats-type "raw" --tgt_token_type "bpe" --src_token_type "bpe" \ + --feats_normalize "utterance_mvn" --lm-args "--max_epoch=1" --inference_args "--beam_size 5" --python "${python}" \ + --st-args "--model_conf extract_feats_in_collect_stats=false --max_epoch=1" + +echo "==== use_streaming, feats_type=raw, token_types=bpe, model_conf.extract_feats_in_collect_stats=False, normalize=utt_mvn ===" +./run.sh --use_streaming true --ngpu 0 --stage 6 --stop-stage 13 --skip-upload false --feats-type "raw" --tgt_token_type "bpe" --src_token_type "bpe" \ + --feats_normalize "utterance_mvn" --lm-args "--max_epoch=1" --inference_args "--beam_size 5" --python "${python}" \ + --st-args "--model_conf extract_feats_in_collect_stats=false --max_epoch=1 --encoder=contextual_block_transformer --decoder=transformer + --encoder_conf block_size=40 --encoder_conf hop_size=16 --encoder_conf look_ahead=16" + +# Remove generated files in order to reduce the disk usage +rm -rf exp dump data +cd "${cwd}" + # [ESPnet2] Validate configuration files echo "" > dummy_token_list echo "==== [ESPnet2] Validation configuration files ===" @@ -124,6 +168,9 @@ if python3 -c 'import torch as t; from distutils.version import LooseVersion as for f in egs2/*/ssl1/conf/train*.yaml; do ${python} -m espnet2.bin.hubert_train --config "${f}" --iterator_type none --normalize none --dry_run true --output_dir out --token_list dummy_token_list done + for f in egs2/*/enh_asr1/conf/train_enh_asr*.yaml; do + ${python} -m espnet2.bin.enh_s2t_train --config "${f}" --iterator_type none --dry_run true --output_dir out --token_list dummy_token_list + done fi # These files must be same each other. 
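For local debugging of the new CI coverage added above, the same checks can be run by hand. A minimal sketch, assuming an activated ESPnet environment and the repository root as working directory; it simply mirrors the commands added to `ci/test_integration_espnet2.sh`.

```bash
# Run the new mini_an4 ENH_ASR integration test outside of CI.
cd egs2/mini_an4/enh_asr1
./run.sh --ngpu 0 --stage 0 --stop-stage 15 --skip-upload_hf false \
    --feats-type "raw" --spk-num 1 \
    --enh_asr_args "--max_epoch=1 --enh_separator_conf num_spk=1"
cd -

# Dry-run validation of the new enh_asr1 training configs.
echo "" > dummy_token_list
for f in egs2/*/enh_asr1/conf/train_enh_asr*.yaml; do
    python -m espnet2.bin.enh_s2t_train --config "${f}" --iterator_type none \
        --dry_run true --output_dir out --token_list dummy_token_list
done
```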
diff --git a/egs/README.md b/egs/README.md index 61951b84d47..78fa57049ae 100755 --- a/egs/README.md +++ b/egs/README.md @@ -8,6 +8,7 @@ See: https://espnet.github.io/espnet/tutorial.html | Directory name | Corpus name | Task | Language | URL | Note | | ----------------------- | ------------------------------------------------------------ | ------------------------------------------ | -------------- | ------------------------------------------------------------ | ----------------------------- | |||| +| aesrc2020 | Accented English Speech Recognition Challenge 2020 | ASR | EN | https://arxiv.org/abs/2102.10233 | | | aidatatang_200zh | Aidatatang_200zh A free Chinese Mandarin speech corpus | ASR | ZH | http://www.openslr.org/62/ | | | aishell | AISHELL-ASR0009-OS1 Open Source Mandarin Speech Corpus | ASR | ZH | http://www.aishelltech.com/kysjcp | | | aishell2 | AISHELL-2 Open Source Mandarin Speech Corpus | ASR | ZH | http://www.aishelltech.com/aishell_2 | @@ -49,7 +50,8 @@ See: https://espnet.github.io/espnet/tutorial.html | librispeech | LibriSpeech ASR corpus | ASR | EN | http://www.openslr.org/12 | | | libritts | LibriTTS: A Corpus Derived from LibriSpeech for Text-to-Speech | TTS | EN | http://www.openslr.org/60/ | | | ljspeech | The LJ Speech Dataset | TTS | EN | https://keithito.com/LJ-Speech-Dataset/ | | -| lrs | The Lip Reading Sentences Dataset | ASR/AVSR | EN | https://www.robots.ox.ac.uk/~vgg/data/lip_reading/lrs2.html | | +| lrs2 | The Lip Reading Sentences 2 Dataset | ASR | ENG | https://www.robots.ox.ac.uk/~vgg/data/lip_reading/lrs2.html | | +| lrs | The Lip Reading Sentences 2 and 3 Dataset | AVSR | ENG | https://www.robots.ox.ac.uk/~vgg/data/lip_reading/lrs2.html https://www.robots.ox.ac.uk/~vgg/data/lip_reading/lrs3.html | | | m_ailabs | The M-AILABS Speech Dataset | TTS | ~5 languages | https://www.caito.de/2019/01/the-m-ailabs-speech-dataset/ | | mucs_2021 | MUCS 2021: MUltilingual and Code-Switching ASR Challenges for Low Resource Indian Languages | ASR/Code Switching | HI, MR, OR, TA, TE, GU, HI-EN, BN-EN | https://navana-tech.github.io/MUCS2021/data.html | | | mtedx | Multilingual TEDx | ASR/Machine Translation/Speech Translation | 13 Language pairs | http://www.openslr.org/100/ | diff --git a/egs/aesrc2020/asr1/RESULTS.md b/egs/aesrc2020/asr1/RESULTS.md new file mode 100644 index 00000000000..e69de29bb2d diff --git a/egs/lrs/asr1/cmd.sh b/egs/aesrc2020/asr1/cmd.sh similarity index 100% rename from egs/lrs/asr1/cmd.sh rename to egs/aesrc2020/asr1/cmd.sh diff --git a/egs/aesrc2020/asr1/conf/decode.yaml b/egs/aesrc2020/asr1/conf/decode.yaml new file mode 120000 index 00000000000..1f358f011d4 --- /dev/null +++ b/egs/aesrc2020/asr1/conf/decode.yaml @@ -0,0 +1 @@ +tuning/decode_pytorch_transformer.yaml \ No newline at end of file diff --git a/egs/lrs/asr1/conf/fbank.conf b/egs/aesrc2020/asr1/conf/fbank.conf similarity index 100% rename from egs/lrs/asr1/conf/fbank.conf rename to egs/aesrc2020/asr1/conf/fbank.conf diff --git a/egs/lrs/asr1/conf/gpu.conf b/egs/aesrc2020/asr1/conf/gpu.conf similarity index 100% rename from egs/lrs/asr1/conf/gpu.conf rename to egs/aesrc2020/asr1/conf/gpu.conf diff --git a/egs/aesrc2020/asr1/conf/lm.yaml b/egs/aesrc2020/asr1/conf/lm.yaml new file mode 100644 index 00000000000..ea738c16807 --- /dev/null +++ b/egs/aesrc2020/asr1/conf/lm.yaml @@ -0,0 +1,8 @@ +# rnnlm related +layer: 2 +unit: 650 +opt: sgd # or adam +batchsize: 64 # batch size in LM training +epoch: 20 # if the data size is large, we can reduce this +patience: 3 +maxlen: 100 # if 
sentence length > lm_maxlen, lm_batchsize is automatically reduced diff --git a/egs/lrs/asr1/conf/pitch.conf b/egs/aesrc2020/asr1/conf/pitch.conf similarity index 100% rename from egs/lrs/asr1/conf/pitch.conf rename to egs/aesrc2020/asr1/conf/pitch.conf diff --git a/egs/lrs/asr1/conf/queue.conf b/egs/aesrc2020/asr1/conf/queue.conf similarity index 100% rename from egs/lrs/asr1/conf/queue.conf rename to egs/aesrc2020/asr1/conf/queue.conf diff --git a/egs/lrs/asr1/conf/slurm.conf b/egs/aesrc2020/asr1/conf/slurm.conf similarity index 100% rename from egs/lrs/asr1/conf/slurm.conf rename to egs/aesrc2020/asr1/conf/slurm.conf diff --git a/egs/aesrc2020/asr1/conf/specaug.yaml b/egs/aesrc2020/asr1/conf/specaug.yaml new file mode 100644 index 00000000000..3351630d2f3 --- /dev/null +++ b/egs/aesrc2020/asr1/conf/specaug.yaml @@ -0,0 +1,16 @@ +process: + # these three processes are a.k.a. SpecAugument + - type: "time_warp" + max_time_warp: 5 + inplace: true + mode: "PIL" + - type: "freq_mask" + F: 30 + n_mask: 2 + inplace: true + replace_with_zero: false + - type: "time_mask" + T: 40 + n_mask: 2 + inplace: true + replace_with_zero: false \ No newline at end of file diff --git a/egs/aesrc2020/asr1/conf/train.yaml b/egs/aesrc2020/asr1/conf/train.yaml new file mode 120000 index 00000000000..5e11a9c3db2 --- /dev/null +++ b/egs/aesrc2020/asr1/conf/train.yaml @@ -0,0 +1 @@ +tuning/train_pytorch_conformer_kernel15.yaml \ No newline at end of file diff --git a/egs/aesrc2020/asr1/conf/tuning/decode_pytorch_transformer.yaml b/egs/aesrc2020/asr1/conf/tuning/decode_pytorch_transformer.yaml new file mode 100644 index 00000000000..2ece5128686 --- /dev/null +++ b/egs/aesrc2020/asr1/conf/tuning/decode_pytorch_transformer.yaml @@ -0,0 +1,8 @@ +batchsize: 0 +beam-size: 10 +penalty: 0.0 +maxlenratio: 0.0 +minlenratio: 0.0 +ctc-weight: 0.5 +lm-weight: 0.3 +ngram-weight: 0.3 diff --git a/egs/aesrc2020/asr1/conf/tuning/decode_rnn.yaml b/egs/aesrc2020/asr1/conf/tuning/decode_rnn.yaml new file mode 100644 index 00000000000..739044dce1a --- /dev/null +++ b/egs/aesrc2020/asr1/conf/tuning/decode_rnn.yaml @@ -0,0 +1,6 @@ +beam-size: 20 +penalty: 0.0 +maxlenratio: 0.0 +minlenratio: 0.0 +ctc-weight: 0.6 +lm-weight: 0.3 diff --git a/egs/aesrc2020/asr1/conf/tuning/train_pytorch_conformer_kernel15.yaml b/egs/aesrc2020/asr1/conf/tuning/train_pytorch_conformer_kernel15.yaml new file mode 100644 index 00000000000..8769ba67139 --- /dev/null +++ b/egs/aesrc2020/asr1/conf/tuning/train_pytorch_conformer_kernel15.yaml @@ -0,0 +1,47 @@ +# network architecture +# encoder related +elayers: 12 +eunits: 2048 +# decoder related +dlayers: 6 +dunits: 2048 +# attention related +adim: 256 +aheads: 4 + +# hybrid CTC/attention +mtlalpha: 0.3 + +# label smoothing +lsm-weight: 0.1 + +# minibatch related +batch-size: 32 +maxlen-in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen-out: 150 # if output length > maxlen-out, batchsize is automatically reduced + +# optimization related +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +opt: noam +accum-grad: 2 +grad-clip: 5 +patience: 0 +epochs: 50 +dropout-rate: 0.1 + +# transformer specific setting +backend: pytorch +model-module: "espnet.nets.pytorch_backend.e2e_asr_conformer:E2E" +transformer-input-layer: conv2d # encoder architecture type +transformer-lr: 1.0 +transformer-warmup-steps: 25000 +transformer-attn-dropout-rate: 0.0 +transformer-length-normalized-loss: false +transformer-init: pytorch + 
+# conformer specific setting +transformer-encoder-pos-enc-layer-type: rel_pos +transformer-encoder-selfattn-layer-type: rel_selfattn +macaron-style: true +use-cnn-module: true +cnn-module-kernel: 15 diff --git a/egs/aesrc2020/asr1/conf/tuning/train_pytorch_conformer_kernel31.yaml b/egs/aesrc2020/asr1/conf/tuning/train_pytorch_conformer_kernel31.yaml new file mode 100644 index 00000000000..50d44abb5ab --- /dev/null +++ b/egs/aesrc2020/asr1/conf/tuning/train_pytorch_conformer_kernel31.yaml @@ -0,0 +1,47 @@ +# network architecture +# encoder related +elayers: 12 +eunits: 2048 +# decoder related +dlayers: 6 +dunits: 2048 +# attention related +adim: 256 +aheads: 4 + +# hybrid CTC/attention +mtlalpha: 0.3 + +# label smoothing +lsm-weight: 0.1 + +# minibatch related +batch-size: 32 +maxlen-in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen-out: 150 # if output length > maxlen-out, batchsize is automatically reduced + +# optimization related +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +opt: noam +accum-grad: 2 +grad-clip: 5 +patience: 0 +epochs: 50 +dropout-rate: 0.1 + +# transformer specific setting +backend: pytorch +model-module: "espnet.nets.pytorch_backend.e2e_asr_conformer:E2E" +transformer-input-layer: conv2d # encoder architecture type +transformer-lr: 1.0 +transformer-warmup-steps: 25000 +transformer-attn-dropout-rate: 0.0 +transformer-length-normalized-loss: false +transformer-init: pytorch + +# conformer specific setting +transformer-encoder-pos-enc-layer-type: rel_pos +transformer-encoder-selfattn-layer-type: rel_selfattn +macaron-style: true +use-cnn-module: true +cnn-module-kernel: 31 diff --git a/egs/aesrc2020/asr1/conf/tuning/train_pytorch_transformer.yaml b/egs/aesrc2020/asr1/conf/tuning/train_pytorch_transformer.yaml new file mode 100644 index 00000000000..4dd0b4e8247 --- /dev/null +++ b/egs/aesrc2020/asr1/conf/tuning/train_pytorch_transformer.yaml @@ -0,0 +1,40 @@ +# network architecture +# encoder related +elayers: 12 +eunits: 2048 +# decoder related +dlayers: 6 +dunits: 2048 +# attention related +adim: 256 +aheads: 4 + +# hybrid CTC/attention +mtlalpha: 0.3 + +# label smoothing +lsm-weight: 0.1 + +# minibatch related +batch-size: 32 +maxlen-in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen-out: 150 # if output length > maxlen-out, batchsize is automatically reduced + +# optimization related +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +opt: noam +accum-grad: 2 +grad-clip: 5 +patience: 0 +epochs: 50 +dropout-rate: 0.1 + +# transformer specific setting +backend: pytorch +model-module: "espnet.nets.pytorch_backend.e2e_asr_transformer:E2E" +transformer-input-layer: conv2d # encoder architecture type +transformer-lr: 2.0 +transformer-warmup-steps: 25000 +transformer-attn-dropout-rate: 0.0 +transformer-length-normalized-loss: false +transformer-init: pytorch diff --git a/egs/aesrc2020/asr1/conf/tuning/train_rnn.yaml b/egs/aesrc2020/asr1/conf/tuning/train_rnn.yaml new file mode 100644 index 00000000000..ca5e99fa320 --- /dev/null +++ b/egs/aesrc2020/asr1/conf/tuning/train_rnn.yaml @@ -0,0 +1,31 @@ +# network architecture +# encoder related +etype: vggblstm # encoder architecture type +elayers: 3 +eunits: 1024 +eprojs: 1024 +subsample: "1_2_2_1_1" # skip every n frame from input to nth layers +# decoder related +dlayers: 2 +dunits: 1024 +# attention 
related +atype: location +adim: 1024 +aconv-chans: 10 +aconv-filts: 100 + +# hybrid CTC/attention +mtlalpha: 0.5 + +# minibatch related +batch-size: 30 +maxlen-in: 800 # if input length > maxlen_in, batchsize is automatically reduced +maxlen-out: 150 # if output length > maxlen_out, batchsize is automatically reduced + +# optimization related +opt: adadelta +epochs: 10 +patience: 0 + +# scheduled sampling option +sampling-probability: 0.0 diff --git a/egs/aesrc2020/asr1/local/create_subsets.sh b/egs/aesrc2020/asr1/local/create_subsets.sh new file mode 100755 index 00000000000..f2667260c7b --- /dev/null +++ b/egs/aesrc2020/asr1/local/create_subsets.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +. ./path.sh || exit 1; +. ./cmd.sh || exit 1; + +data=$1 # data transformed into kaldi format + + # divide development set for cross validation + if [ -d ${data} ];then + for i in US UK IND CHN JPN PT RU KR CA ES;do + ./utils/subset_data_dir.sh --spk-list local/files/cvlist/${i}_cv_spk $data/data_all $data/cv/$i + cat $data/cv/$i/feats.scp >> $data/cv.scp + done + ./utils/filter_scp.pl --exclude $data/cv.scp $data/data_all/feats.scp > $data/train_and_dev.scp + #95-5 split for dev set + sed -n '0~20p' $data/train_and_dev.scp > $data/dev.scp + ./utils/filter_scp.pl --exclude $data/dev.scp $data/train_and_dev.scp > $data/train.scp + ./utils/subset_data_dir.sh --utt-list $data/train.scp $data/data_all $data/train_org + ./utils/subset_data_dir.sh --utt-list $data/dev.scp $data/data_all $data/dev_org + ./utils/subset_data_dir.sh --utt-list $data/cv.scp $data/data_all $data/cv_all + fi + +echo "local/subset_data.sh succeeded" +exit 0; diff --git a/egs/aesrc2020/asr1/local/data_prep.sh b/egs/aesrc2020/asr1/local/data_prep.sh new file mode 100755 index 00000000000..4d5b26bd217 --- /dev/null +++ b/egs/aesrc2020/asr1/local/data_prep.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +# Copyright 2020 Audio, Speech and Language Processing Group @ NWPU (Author: Xian Shi) +# Apache 2.0 + +. ./path.sh || exit 1; +. ./cmd.sh || exit 1; + +raw_data=$1 # raw data with metadata, txt and wav +data=$2 # data transformed into kaldi format + +# generate kaldi format data for all +if [ -d ${raw_data} ];then + echo "Generating kaldi format data." + mkdir -p $data/data_all + find $raw_data -type f -name "*.wav" > $data/data_all/wavpath + awk -F'/' '{print $(NF-2)"-"$(NF-1)"-"$NF}' $data/data_all/wavpath | sed 's:\.wav::g' > $data/data_all/uttlist + paste $data/data_all/uttlist $data/data_all/wavpath > $data/data_all/wav.scp + python local/preprocess.py $data/data_all/wav.scp $data/data_all/trans $data/data_all/utt2spk # faster than for in shell + ./utils/utt2spk_to_spk2utt.pl $data/data_all/utt2spk > $data/data_all/spk2utt +fi + +# clean transcription +if [ -d $data/data_all ];then + echo "Cleaning transcription." + tr '[a-z]' '[A-Z]' < $data/data_all/trans > $data/data_all/trans_upper + # turn "." 
in specific abbreviations into "" tag + sed -i -e 's: MR\.: MR:g' -e 's: MRS\.: MRS:g' -e 's: MS\.: MS:g' \ + -e 's:^MR\.:MR:g' -e 's:^MRS\.:MRS:g' -e 's:^MS\.:MS:g' $data/data_all/trans_upper + # fix bug + sed -i 's:^ST\.:STREET:g' $data/data_all/trans_upper + sed -i 's: ST\.: STREET:g' $data/data_all/trans_upper + # punctuation marks + sed -i "s%,\|\.\|?\|!\|;\|-\|:\|,'\|\.'\|?'\|!'\| '% %g" $data/data_all/trans_upper + sed -i 's::.:g' $data/data_all/trans_upper + # blank + sed -i 's:[ ][ ]*: :g' $data/data_all/trans_upper + paste $data/data_all/uttlist $data/data_all/trans_upper > $data/data_all/text + + # critally, must replace tab with space between uttid and text + sed -e "s/\t/ /g" -i $data/data_all/text +fi + +echo "local/data_prep.sh succeeded" +exit 0; diff --git a/egs/aesrc2020/asr1/local/download_and_untar.sh b/egs/aesrc2020/asr1/local/download_and_untar.sh new file mode 100755 index 00000000000..046ce35bb1b --- /dev/null +++ b/egs/aesrc2020/asr1/local/download_and_untar.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash + +. ./path.sh || exit 1; +. ./cmd.sh || exit 1; + +zipped_data=$1 +raw_data=$2/Datatang-English/data + +# unzip and rename each accent +unzip $zipped_data -d ${2} +mv $raw_data/American\ English\ Speech\ Data $raw_data/US +mv $raw_data/British\ English\ Speech\ Data $raw_data/UK +mv $raw_data/Chinese\ Speaking\ English\ Speech\ Data $raw_data/CHN +mv $raw_data/Indian\ English\ Speech\ Data $raw_data/IND +mv $raw_data/Portuguese\ Speaking\ English\ Speech\ Data $raw_data/PT +mv $raw_data/Russian\ Speaking\ English\ Speech\ Data $raw_data/RU +mv $raw_data/Japanese\ Speaking\ English\ Speech\ Data $raw_data/JPN +mv $raw_data/Korean\ Speaking\ English\ Speech\ Data $raw_data/KR +mv $raw_data/Canadian\ English\ Speech\ Data $raw_data/CA +mv $raw_data/Spanish\ Speaking\ English\ Speech\ Data $raw_data/ES + +echo "local/download_and_untar.sh succeeded" +exit 0; diff --git a/egs/aesrc2020/asr1/local/files/ar.dict b/egs/aesrc2020/asr1/local/files/ar.dict new file mode 100644 index 00000000000..d17cfb0a5e0 --- /dev/null +++ b/egs/aesrc2020/asr1/local/files/ar.dict @@ -0,0 +1,8 @@ + 0 + 1 + 2 + 3 + 4 + 5 + 6 + 7 diff --git a/egs/aesrc2020/asr1/local/files/cvlist/CA_cv_spk b/egs/aesrc2020/asr1/local/files/cvlist/CA_cv_spk new file mode 100644 index 00000000000..9362f7dc693 --- /dev/null +++ b/egs/aesrc2020/asr1/local/files/cvlist/CA_cv_spk @@ -0,0 +1,4 @@ +CA-G00034 +CA-G00086 +CA-G00414 +CA-G20113 diff --git a/egs/aesrc2020/asr1/local/files/cvlist/CHN_cv_spk b/egs/aesrc2020/asr1/local/files/cvlist/CHN_cv_spk new file mode 100644 index 00000000000..f5ed8b6241c --- /dev/null +++ b/egs/aesrc2020/asr1/local/files/cvlist/CHN_cv_spk @@ -0,0 +1,4 @@ +CHN-G00190 +CHN-G00992 +CHN-G61365 +CHN-G01372 \ No newline at end of file diff --git a/egs/aesrc2020/asr1/local/files/cvlist/ES_cv_spk b/egs/aesrc2020/asr1/local/files/cvlist/ES_cv_spk new file mode 100644 index 00000000000..509dd652f44 --- /dev/null +++ b/egs/aesrc2020/asr1/local/files/cvlist/ES_cv_spk @@ -0,0 +1,4 @@ +ES-G00714 +ES-G01878 +ES-G11701 +ES-G20575 diff --git a/egs/aesrc2020/asr1/local/files/cvlist/IND_cv_spk b/egs/aesrc2020/asr1/local/files/cvlist/IND_cv_spk new file mode 100644 index 00000000000..72b5df67cf8 --- /dev/null +++ b/egs/aesrc2020/asr1/local/files/cvlist/IND_cv_spk @@ -0,0 +1,4 @@ +IND-G00892 +IND-G01006 +IND-G01501 +IND-G0760 \ No newline at end of file diff --git a/egs/aesrc2020/asr1/local/files/cvlist/JPN_cv_spk b/egs/aesrc2020/asr1/local/files/cvlist/JPN_cv_spk new file mode 100644 index 
00000000000..957a43af30b --- /dev/null +++ b/egs/aesrc2020/asr1/local/files/cvlist/JPN_cv_spk @@ -0,0 +1,4 @@ +JPN-G00040 +JPN-G00125 +JPN-G00354 +JPN-G20194 \ No newline at end of file diff --git a/egs/aesrc2020/asr1/local/files/cvlist/KR_cv_spk b/egs/aesrc2020/asr1/local/files/cvlist/KR_cv_spk new file mode 100644 index 00000000000..0e078514d72 --- /dev/null +++ b/egs/aesrc2020/asr1/local/files/cvlist/KR_cv_spk @@ -0,0 +1,4 @@ +KR-G00022 +KR-G00276 +KR-G10029 +KR-G10122 \ No newline at end of file diff --git a/egs/aesrc2020/asr1/local/files/cvlist/PT_cv_spk b/egs/aesrc2020/asr1/local/files/cvlist/PT_cv_spk new file mode 100644 index 00000000000..89f09e4756e --- /dev/null +++ b/egs/aesrc2020/asr1/local/files/cvlist/PT_cv_spk @@ -0,0 +1,5 @@ +PT-G00600 +PT-G00643 +PT-G00963 +PT-G10618 +PT-G20539 \ No newline at end of file diff --git a/egs/aesrc2020/asr1/local/files/cvlist/RU_cv_spk b/egs/aesrc2020/asr1/local/files/cvlist/RU_cv_spk new file mode 100644 index 00000000000..3069b2e4f6f --- /dev/null +++ b/egs/aesrc2020/asr1/local/files/cvlist/RU_cv_spk @@ -0,0 +1,4 @@ +RU-G00163 +RU-G00196 +RU-G00439 +RU-G10416 \ No newline at end of file diff --git a/egs/aesrc2020/asr1/local/files/cvlist/UK_cv_spk b/egs/aesrc2020/asr1/local/files/cvlist/UK_cv_spk new file mode 100644 index 00000000000..fe7cd8b43cd --- /dev/null +++ b/egs/aesrc2020/asr1/local/files/cvlist/UK_cv_spk @@ -0,0 +1,8 @@ +UK-G00025 +UK-G00808 +UK-G01337 +UK-G01807 +UK-G10261 +UK-G11032 +UK-G11739 +UK-G40517 \ No newline at end of file diff --git a/egs/aesrc2020/asr1/local/files/cvlist/US_cv_spk b/egs/aesrc2020/asr1/local/files/cvlist/US_cv_spk new file mode 100644 index 00000000000..760edcea2a0 --- /dev/null +++ b/egs/aesrc2020/asr1/local/files/cvlist/US_cv_spk @@ -0,0 +1,6 @@ +US-G00007 +US-G01459 +US-G10948 +US-G20537 +US-G20939 +US-G30201 \ No newline at end of file diff --git a/egs/aesrc2020/asr1/local/preprocess.py b/egs/aesrc2020/asr1/local/preprocess.py new file mode 100755 index 00000000000..f5939848f4e --- /dev/null +++ b/egs/aesrc2020/asr1/local/preprocess.py @@ -0,0 +1,18 @@ +# Copyright 2020 Audio, Speech and Language Processing Group @ NWPU (Author: Xian Shi) +# Apache 2.0 + +import sys + +fin = open(sys.argv[1], "r") +fout_text = open(sys.argv[2], "w") +fout_utt2spk = open(sys.argv[3], "w") + +for line in fin.readlines(): + uttid, path = line.strip("\n").split("\t") + text_path = path.replace(".wav", ".txt") + text_ori = open(text_path, "r").readlines()[0].strip("\n") + feild = path.split("/") + accid = feild[-3] + spkid = accid + "-" + feild[-2] + fout_utt2spk.write(uttid + "\t" + spkid + "\n") + fout_text.write(text_ori + "\n") diff --git a/egs/aesrc2020/asr1/path.sh b/egs/aesrc2020/asr1/path.sh new file mode 100644 index 00000000000..d405bf59826 --- /dev/null +++ b/egs/aesrc2020/asr1/path.sh @@ -0,0 +1,17 @@ +MAIN_ROOT=$PWD/../../.. +KALDI_ROOT=$MAIN_ROOT/tools/kaldi + + +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C + +export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:$MAIN_ROOT/tools/chainer_ctc/ext/warp-ctc/build +. "${MAIN_ROOT}"/tools/activate_python.sh && . 
"${MAIN_ROOT}"/tools/extra_path.sh +export PATH=$MAIN_ROOT/utils:$MAIN_ROOT/espnet/bin:$PATH + +export OMP_NUM_THREADS=1 + +# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 diff --git a/egs/aesrc2020/asr1/run.sh b/egs/aesrc2020/asr1/run.sh new file mode 100755 index 00000000000..1cd0d51791f --- /dev/null +++ b/egs/aesrc2020/asr1/run.sh @@ -0,0 +1,322 @@ +#!/usr/bin/env bash + +# Copyright 2017 Johns Hopkins University (Shinji Watanabe) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +. ./path.sh || exit 1; +. ./cmd.sh || exit 1; + +# general configuration +backend=pytorch +stage=-1 # start from -1 if you need to start from data download +stop_stage=100 +ngpu=8 # number of gpus ("0" uses cpu, otherwise use gpu) +nj=32 +debugmode=1 +dumpdir=dump # directory to dump full features +N=0 # number of minibatches to be used (mainly for debugging). "0" uses all minibatches. +verbose=0 # verbose option +resume= # Resume the training from snapshot + +# feature configuration +do_delta=false + +preprocess_config=conf/specaug.yaml +train_config=conf/train.yaml # current default recipe requires 4 gpus. + # if you do not have 4 gpus, please reconfigure the `batch-bins` and `accum-grad` parameters in config. +lm_config=conf/lm.yaml +decode_config=conf/decode.yaml + +# rnnlm related +lm_resume= # specify a snapshot file to resume LM training +lmtag= # tag for managing LMs + +# decoding parameter +recog_model=model.acc.best # set a model to be used for decoding: 'model.acc.best' or 'model.loss.best' +lang_model=rnnlm.model.best # set a language model to be used for decoding + +# model average realted (only for transformer) +n_average=5 # the number of ASR models to be averaged +use_valbest_average=true # if true, the validation `n_average`-best ASR models will be averaged. + # if false, the last `n_average` ASR models will be averaged. +lm_n_average=0 # the number of languge models to be averaged +use_lm_valbest_average=false # if true, the validation `lm_n_average`-best language models will be averaged. + # if false, the last `lm_n_average` language models will be averaged. + +# Set this to somewhere where you want to put your data, or where +# someone else has already put it. You'll want to change this +# if you're not on the CLSP grid. +datadir= + +# The AESRC2020 data needs to be requested via services@datatang.com +# The provided data will be a zip +datazip= + +# bpemode (unigram or bpe) +nbpe=5000 +bpemode=unigram + +# exp tag +tag="" # tag for managing experiments. + +. utils/parse_options.sh || exit 1; + +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +train_set=train +train_sp=train_sp +train_dev=dev +recog_set="US UK IND CHN JPN PT RU KR CA ES" + +if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then + echo "stage -1: Data Download" + if [ ! -f ${datazip} ]; then + echo "The AESRC2020 data needs to be requested via services@datatang.com" + exit 1 + fi + local/download_and_untar.sh ${datazip} ${datadir} +fi + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + ### Task dependent. You have to make data the following preparation part by yourself. 
+ ### But you can utilize Kaldi recipes in most cases + echo "stage 0: Data preparation" + local/data_prep.sh $datadir/Datatang-English/data data + ./utils/fix_data_dir.sh data/data_all +fi + +feat_tr_dir=${dumpdir}/${train_set}/delta${do_delta}; mkdir -p ${feat_tr_dir} +feat_sp_dir=${dumpdir}/${train_sp}/delta${do_delta}; mkdir -p ${feat_sp_dir} +feat_dt_dir=${dumpdir}/${train_dev}/delta${do_delta}; mkdir -p ${feat_dt_dir} +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + ### Task dependent. You have to design training and dev sets by yourself. + ### But you can utilize Kaldi recipes in most cases + echo "stage 1: Feature Generation" + fbankdir=fbank + # Generate the fbank features; by default 80-dimensional fbanks with pitch on each frame + steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj ${nj} --write_utt2num_frames true \ + data/data_all exp/make_fbank/data_all ${fbankdir} + utils/fix_data_dir.sh data/data_all + + # Data splits + local/create_subsets.sh data + + utils/perturb_data_dir_speed.sh 0.9 data/${train_set}_org data/temp1 + utils/perturb_data_dir_speed.sh 1.0 data/${train_set}_org data/temp2 + utils/perturb_data_dir_speed.sh 1.1 data/${train_set}_org data/temp3 + + utils/combine_data.sh --extra-files utt2uniq data/${train_sp}_org data/temp1 data/temp2 data/temp3 + + # remove utt having more than 3000 frames + # remove utt having more than 400 characters + remove_longshortdata.sh --maxframes 3000 --maxchars 400 data/${train_set}_org data/${train_set} + remove_longshortdata.sh --maxframes 3000 --maxchars 400 data/${train_sp}_org data/${train_sp} + remove_longshortdata.sh --maxframes 3000 --maxchars 400 data/${train_dev}_org data/${train_dev} + steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj $nj --write_utt2num_frames true \ + data/$train_sp exp/make_fbank/$train_sp ${fbankdir} + rm data/train_sp/utt2dur #hacked + utils/fix_data_dir.sh data/train_sp + # compute global CMVN + compute-cmvn-stats scp:data/${train_sp}/feats.scp data/${train_sp}/cmvn.ark + + # dump features for training + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d ${feat_tr_dir}/storage ]; then + utils/create_split_dir.pl \ + /export/b{14,15,16,17}/${USER}/espnet-data/egs/librispeech/asr1/dump/${train_set}/delta${do_delta}/storage \ + ${feat_tr_dir}/storage + fi + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d ${feat_dt_dir}/storage ]; then + utils/create_split_dir.pl \ + /export/b{14,15,16,17}/${USER}/espnet-data/egs/librispeech/asr1/dump/${train_dev}/delta${do_delta}/storage \ + ${feat_dt_dir}/storage + fi + dump.sh --cmd "$train_cmd" --nj ${nj} --do_delta ${do_delta} \ + data/${train_sp}/feats.scp data/${train_sp}/cmvn.ark exp/dump_feats/$train_sp ${feat_sp_dir} + dump.sh --cmd "$train_cmd" --nj ${nj} --do_delta ${do_delta} \ + data/${train_dev}/feats.scp data/${train_sp}/cmvn.ark exp/dump_feats/data_all ${feat_dt_dir} + for rtask in ${recog_set}; do + feat_recog_dir=${dumpdir}/${rtask}/delta${do_delta}; mkdir -p ${feat_recog_dir} + dump.sh --cmd "$train_cmd" --nj ${nj} --do_delta ${do_delta} \ + data/cv/${rtask}/feats.scp data/${train_sp}/cmvn.ark exp/dump_feats/recog/data_all \ + ${feat_recog_dir} + done +fi + +dict=data/lang_char/${train_set}_${bpemode}${nbpe}_units.txt +bpemodel=data/lang_char/${train_set}_${bpemode}${nbpe} +echo "dictionary: ${dict}" +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + ### Task dependent. You have to check non-linguistic symbols used in the corpus. 
+ echo "stage 2: Dictionary and Json Data Preparation" + mkdir -p data/lang_char/ + echo " 1" > ${dict} # must be 1, 0 will be used for "blank" in CTC + cut -f 2- -d" " data/${train_set}/text > data/lang_char/input.txt + spm_train --input=data/lang_char/input.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000 + spm_encode --model=${bpemodel}.model --output_format=piece < data/lang_char/input.txt | tr ' ' '\n' | sort | uniq | awk '{print $0 " " NR+1}' >> ${dict} + wc -l ${dict} + + # make json labels + data2json.sh --nj ${nj} --feat ${feat_sp_dir}/feats.scp --bpecode ${bpemodel}.model \ + data/${train_sp} ${dict} > ${feat_sp_dir}/data_${bpemode}${nbpe}.json + data2json.sh --nj ${nj} --feat ${feat_dt_dir}/feats.scp --bpecode ${bpemodel}.model \ + data/${train_dev} ${dict} > ${feat_dt_dir}/data_${bpemode}${nbpe}.json + + for rtask in ${recog_set}; do + feat_recog_dir=${dumpdir}/${rtask}/delta${do_delta} + data2json.sh --nj ${nj} --feat ${feat_recog_dir}/feats.scp --bpecode ${bpemodel}.model \ + data/cv/${rtask} ${dict} > ${feat_recog_dir}/data_${bpemode}${nbpe}.json + done +fi + +# You can skip this and remove --rnnlm option in the recognition (stage 5) +if [ -z ${lmtag} ]; then + lmtag=$(basename ${lm_config%.*}) +fi +lmexpname=train_rnnlm_${backend}_${lmtag}_${bpemode}${nbpe}_ngpu${ngpu} +lmexpdir=exp/${lmexpname} +mkdir -p ${lmexpdir} + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + echo "stage 3: LM Preparation" + lmdatadir=data/local/lm_train_${bpemode}${nbpe} + # use external data + if [ ! -e data/local/lm_train/librispeech-lm-norm.txt.gz ]; then + wget http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz -P data/local/lm_train/ + fi + if [ ! -e ${lmdatadir} ]; then + mkdir -p ${lmdatadir} + cut -f 2- -d" " data/${train_set}/text | gzip -c > data/local/lm_train/${train_set}_text.gz + # combine external text and transcriptions and shuffle them with seed 777 + zcat data/local/lm_train/librispeech-lm-norm.txt.gz data/local/lm_train/${train_set}_text.gz |\ + spm_encode --model=${bpemodel}.model --output_format=piece > ${lmdatadir}/train.txt + cut -f 2- -d" " data/${train_dev}/text | spm_encode --model=${bpemodel}.model --output_format=piece \ + > ${lmdatadir}/valid.txt + fi + ${cuda_cmd} --gpu ${ngpu} ${lmexpdir}/train.log \ + lm_train.py \ + --config ${lm_config} \ + --ngpu ${ngpu} \ + --backend ${backend} \ + --verbose 1 \ + --outdir ${lmexpdir} \ + --tensorboard-dir tensorboard/${lmexpname} \ + --train-label ${lmdatadir}/train.txt \ + --valid-label ${lmdatadir}/valid.txt \ + --resume ${lm_resume} \ + --dict ${dict} \ + --dump-hdf5-path ${lmdatadir} +fi + +if [ -z ${tag} ]; then + expname=${train_set}_${backend}_$(basename ${train_config%.*}) + if ${do_delta}; then + expname=${expname}_delta + fi + if [ -n "${preprocess_config}" ]; then + expname=${expname}_$(basename ${preprocess_config%.*}) + fi +else + expname=${train_set}_${backend}_${tag} +fi +expdir=exp/${expname} +mkdir -p ${expdir} + +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + echo "stage 4: Network Training" + ${cuda_cmd} --gpu ${ngpu} ${expdir}/train.log \ + asr_train.py \ + --config ${train_config} \ + --preprocess-conf ${preprocess_config} \ + --ngpu ${ngpu} \ + --backend ${backend} \ + --outdir ${expdir}/results \ + --tensorboard-dir tensorboard/${expname} \ + --debugmode ${debugmode} \ + --dict ${dict} \ + --debugdir ${expdir} \ + --minibatches ${N} \ + --verbose ${verbose} \ + --resume ${resume} \ + --train-json 
${feat_sp_dir}/data_${bpemode}${nbpe}.json \ + --valid-json ${feat_dt_dir}/data_${bpemode}${nbpe}.json +fi + +if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then + echo "stage 5: Decoding" + if [[ $(get_yaml.py ${train_config} model-module) = *transformer* ]] || \ + [[ $(get_yaml.py ${train_config} model-module) = *conformer* ]] || \ + [[ $(get_yaml.py ${train_config} etype) = custom ]] || \ + [[ $(get_yaml.py ${train_config} dtype) = custom ]]; then + # Average ASR models + if ${use_valbest_average}; then + recog_model=model.val${n_average}.avg.best + opt="--log ${expdir}/results/log" + else + recog_model=model.last${n_average}.avg.best + opt="--log" + fi + average_checkpoints.py \ + ${opt} \ + --backend ${backend} \ + --snapshots ${expdir}/results/snapshot.ep.* \ + --out ${expdir}/results/${recog_model} \ + --num ${n_average} + + # Average LM models + if [ ${lm_n_average} -eq 0 ]; then + lang_model=rnnlm.model.best + else + if ${use_lm_valbest_average}; then + lang_model=rnnlm.val${lm_n_average}.avg.best + opt="--log ${lmexpdir}/log" + else + lang_model=rnnlm.last${lm_n_average}.avg.best + opt="--log" + fi + average_checkpoints.py \ + ${opt} \ + --backend ${backend} \ + --snapshots ${lmexpdir}/snapshot.ep.* \ + --out ${lmexpdir}/${lang_model} \ + --num ${lm_n_average} + fi + fi + + pids=() # initialize pids + for rtask in ${recog_set}; do + ( + decode_dir=decode_${rtask}_${recog_model}_$(basename ${decode_config%.*})_${lmtag} + feat_recog_dir=${dumpdir}/${rtask}/delta${do_delta} + + # split data + splitjson.py --parts ${nj} ${feat_recog_dir}/data_${bpemode}${nbpe}.json + + #### use CPU for decoding + ngpu=0 + + # set batchsize 0 to disable batch decoding + ${decode_cmd} JOB=1:${nj} ${expdir}/${decode_dir}/log/decode.JOB.log \ + asr_recog.py \ + --config ${decode_config} \ + --ngpu ${ngpu} \ + --backend ${backend} \ + --batchsize 0 \ + --recog-json ${feat_recog_dir}/split${nj}utt/data_${bpemode}${nbpe}.JOB.json \ + --result-label ${expdir}/${decode_dir}/data.JOB.json \ + --model ${expdir}/results/${recog_model} \ + --rnnlm ${lmexpdir}/${lang_model} \ + --api v2 + + score_sclite.sh --bpe ${nbpe} --bpemodel ${bpemodel}.model --wer true ${expdir}/${decode_dir} ${dict} + + ) & + pids+=($!) # store background pids + done + i=0; for pid in "${pids[@]}"; do wait ${pid} || ((++i)); done + [ ${i} -gt 0 ] && echo "$0: ${i} background jobs are failed." && false + echo "Finished" +fi diff --git a/egs/lrs/asr1/steps b/egs/aesrc2020/asr1/steps similarity index 100% rename from egs/lrs/asr1/steps rename to egs/aesrc2020/asr1/steps diff --git a/egs/lrs/asr1/utils b/egs/aesrc2020/asr1/utils similarity index 100% rename from egs/lrs/asr1/utils rename to egs/aesrc2020/asr1/utils diff --git a/egs/commonvoice/asr1/local/download_and_untar.sh b/egs/commonvoice/asr1/local/download_and_untar.sh index 1f5c40d9b0e..cce26302127 100755 --- a/egs/commonvoice/asr1/local/download_and_untar.sh +++ b/egs/commonvoice/asr1/local/download_and_untar.sh @@ -16,7 +16,7 @@ fi if [ $# -ne 3 ]; then echo "Usage: $0 [--remove-archive] " - echo "e.g.: $0 /export/data/ https://common-voice-data-download.s3.amazonaws.com/cv_corpus_v1.tar.gz cv_corpus_v1.tar.gz" + echo "e.g.: $0 /export/data/ https://us.openslr.org/resources/108/FR.tgz" echo "With --remove-archive it will remove the archive after successfully un-tarring it." 
exit 0; fi diff --git a/egs/lrs/README.md new file mode 100644 index 00000000000..26f623cd08b --- /dev/null +++ b/egs/lrs/README.md @@ -0,0 +1,335 @@ +# ESPnet-AVSR + +## Introduction +This repository contains an implementation of end-to-end (E2E) audio-visual speech recognition (AVSR) based on the ESPnet ASR toolkit. The new fusion strategy follows the paper "Fusing information streams in end-to-end audio-visual speech recognition" (https://ieeexplore.ieee.org/document/9414553) [[1]](#literature). A broad range of reliability measures is used to help the integration model improve the performance of the AVSR model. We use two large-vocabulary datasets, the Lip Reading Sentences 2 and 3 corpora, for all our experiments. +In addition, this project also contains an audio-only model for comparison. + +## Table of Contents +- [Installation](#installation-of-required-packages) + * [Requirements](#requirements) +- [Project Structure](#project-structure) + * [Basics](#project-structure) + * [AVSR1](#detailed-description-of-avsr1) +- [Usage of the scripts](#running-the-script) + + [Notes](#notes) + + +## Installation of required packages + +### Requirements + +For installation, approximately 40GB of free disk space is needed. avsr1/run.sh stage 0 installs all required packages in avsr1/local/installations: + +**Required Packages:** +1. ESPnet: https://github.com/espnet/espnet +2. OpenFace: https://github.com/TadasBaltrusaitis/OpenFace +3. DeepXi: https://github.com/anicolson/DeepXi +4. Vidaug: https://github.com/okankop/vidaug + + + +## Project structure +The main folder, avsr1/, contains the code for the audio-visual speech recognition system, trained on the LRS2 [[2]](#literature) dataset together with the LRS3 dataset (https://www.robots.ox.ac.uk/~vgg/data/lip_reading/lrs3.html) [[3]](#literature). It follows the basic ESPnet structure. +The main code for the recognition system is the run.sh script. In the script, the workflow of the system is performed in multiple stages: + +| AVSR | |-------------------------------------------------------------| | Stage 0: Install required packages | | Stage 1: Data Download and preparation | | Stage 2: Audio augmentation | | Stage 3: MP3 files and Feature Generation | | Stage 4: Dictionary and JSON data preparation | | Stage 5: Reliability measures generation | | Stage 6: Language model training | | Stage 7: Training of the E2E-AVSR model and Decoding | + + + + + + +### Detailed description of AVSR1: + +##### Stage 0: Package installation + * Install the required packages: ESPnet, OpenFace, DeepXi, Vidaug in avsr1/local/installations. To install OpenFace, you will need sudo rights. + +##### Stage 1: Data preparation + * The LRS2 dataset [2] must be downloaded in advance by yourself. For downloading the dataset, please visit https://www.robots.ox.ac.uk/~vgg/data/lip_reading/lrs2.html/ [2]. You will need to sign a data-sharing agreement with BBC Research & Development before getting access. After downloading, please edit the path.sh file and assign the dataset directory path to the DATA_DIR variable + * The same applies to the LRS3 dataset https://www.robots.ox.ac.uk/~vgg/data/lip_reading/lrs3.html [3].
After downloading, please edit the path.sh file and assign the dataset directory path to the DATALRS3_DIR variable + * Download the Musan dataset for audio data augmentation and save it under the ${MUSAN_DIR} directory + * Download the Room Impulse Response and Noise Database (RIRS-Noises) and save it under the RIRS_NOISES/ directory + * Run the audio_data_prep.sh script: create file lists for the given part of the dataset and prepare the Kaldi files + * Dump useful data for training + +##### Stage 2: Audio Augmentation + * Augment the audio data with RIRS noise + * Augment the audio data with Musan noise + * The augmented files are saved under data/audio/augment, whereas the clean audio files can be found in data/audio/clear for all the used datasets (Test, Validation (Val), Train and optional Pretrain) + +##### Stage 3: Feature Generation + * Make augmented MP3 files + * Generate the fbank and MFCC features for the audio signals. By default, 80-dimensional filterbanks with pitch on each frame are used + * Compute global cepstral mean and variance normalization (CMVN) statistics over the training features (https://kaldi-asr.org/doc/compute-cmvn-stats_8cc.html). + +##### Stage 4: Dictionary and JSON data preparation + * Build the dictionary and prepare the JSON data + * Build a tokenizer using SentencePiece: https://github.com/google/sentencepiece + +##### Stage 5: Reliability measures generation + * Stage 5.0: Create dump file for MFCC features + * Stage 5.1: Video augmentation with Gaussian blur and salt&pepper noise + * Stage 5.2: OpenFace facial landmark extraction (especially the mouth region; for further details, see the documentation in the avsr1/local folder) + * Stage 5.3: Extract video frames + * Stage 5.4: Estimate SNRs using the DeepXi framework + * Stage 5.5: Extract video features with a pretrained video feature extractor [[4]](#literature) + * Stage 5.6: Make video .ark files + * Stage 5.7: Remake audio and video dump files + * Stage 5.8: Split test decode dump files by different signal-to-noise ratios + +##### Stage 6: Language Model Training + * Train your own language model on the LibriSpeech dataset (https://www.openslr.org/11/) or use a pretrained language model + * It is possible to skip the language model and use the system without an external language model. + +##### Stage 7: Network Training + * Train the audio model + * Pretrain the video model + * Finetune the video model + * Pretrain the AV model + * Finetune the AV model (model used for decoding) + +##### Other important references: + * Explanation of the CSV file for OpenFace: https://github.com/TadasBaltrusaitis/OpenFace/wiki/Output-Format#featureextraction + + +## Running the script +The runtime script is **run.sh**, which can be found in the avsr1/ directory. +> Before running the script, please download the LRS2 (https://www.robots.ox.ac.uk/~vgg/data/lip_reading/lrs2.html) [[2]](#literature) and LRS3 (https://www.robots.ox.ac.uk/~vgg/data/lip_reading/lrs3.html) [[3]](#literature) datasets by yourself and save the download paths to the variables DATA_DIR (LRS2 path) and DATALRS3_DIR (LRS3 path) inside the run.sh file. + +### Notes +Due to the long runtime, it can be useful to run the script with the screen command, monitor it in a terminal window, and redirect the output to a log file. + +Screen is a terminal multiplexer, which means that you can start any number of virtual terminals inside the current terminal session.
The advantage is that you can detach virtual terminals so that they keep running in the background. The processes keep running even if you close the main session or an ssh connection while working remotely on a server. +Screen can be installed from the official package repositories via +```console +foo@bar:~$ sudo apt install screen +``` +As an example, to redirect the output into a file named "log_run_sh.txt", the script could be started with: +```console +foo@bar:~/avsr1$ screen bash -c 'bash run.sh |& tee -a log_run_sh.txt' +``` +This will start a virtual terminal session that executes and monitors the run.sh file. The output is printed to this session as well as saved into the file "log_run_sh.txt". You can leave the monitoring session by pressing ctrl+A+D. If you want to return to the process, simply type +```console +foo@bar:~$ screen -ls +``` +into a terminal to see all running screen processes with their corresponding IDs. Then execute +```console +foo@bar:~$ screen -r [ID] +``` +to return to the process. +Source: https://wiki.ubuntuusers.de/Screen/ *** ### Literature [1] W. Yu, S. Zeiler and D. Kolossa, "Fusing Information Streams in End-to-End Audio-Visual Speech Recognition," ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), 2021, pp. 3430-3434, doi: 10.1109/ICASSP39728.2021.9414553. [2] T. Afouras, J. S. Chung, A. Senior, O. Vinyals, A. Zisserman
+
+***
+### Literature
+
+[1] W. Yu, S. Zeiler and D. Kolossa, "Fusing Information Streams in End-to-End Audio-Visual Speech Recognition," ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), 2021, pp. 3430-3434, doi: 10.1109/ICASSP39728.2021.9414553.
+
+[2] T. Afouras, J. S. Chung, A. Senior, O. Vinyals, A. Zisserman
+Deep Audio-Visual Speech Recognition
+arXiv: 1809.02108
+
+[3] T. Afouras, J. S. Chung, A. Zisserman
+LRS3-TED: a large-scale dataset for visual speech recognition +arXiv preprint arXiv: 1809.00496 + +[4] S. Petridis, T. Stafylakis, P. Ma, G. Tzimiropoulos, andM. Pantic, “Audio-visual speech recognition with a hybridCTC/Attention architecture,” in IEEE SLT. IEEE, 2018. + diff --git a/egs/lrs/avsr1/RESULTS.md b/egs/lrs/avsr1/RESULTS.md new file mode 100755 index 00000000000..2615db795f8 --- /dev/null +++ b/egs/lrs/avsr1/RESULTS.md @@ -0,0 +1,294 @@ +## pretrain_Train_pytorch_audio_delta_specaug (Audio-Only) + +* Model files (archived to model.tar.gz by $ pack_model.sh) + - download link: https://drive.google.com/file/d/1ITgdZoa8vQ7lDwi1jLziYGXOyUtgE2ow/view + - training config file: conf/train.yaml + - decoding config file: conf/decode.yaml + - preprocess config file: conf/specaug.yaml + - lm config file: conf/lm.yaml + - cmvn file: data/train/cmvn.ark + - e2e file: exp/audio/model.last10.avg.best + - e2e json file: exp/audio/model.json + - lm file: exp/train_rnnlm_pytorch_lm_unigram500/rnnlm.model.best + - lm JSON file: exp/train_rnnlm_pytorch_lm_unigram500/model.json + - dict file: data/lang_char/train_unigram500_units.txt + +## Environments +- date: `Mon Feb 21 11:52:07 UTC 2022` +- python version: `3.7.3 (default, Mar 27 2019, 22:11:17) [GCC 7.3.0]` +- espnet version: `espnet 0.6.0` +- chainer version: `chainer 6.0.0` +- pytorch version: `pytorch 1.0.1.post2` + +### CER + +|dataset|SNR in dB|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---|---| +|music noise|-12|171|1669|82.0|11.2|6.8|2.2|20.3|38.6| +||-9|187|1897|87.0|8.3|4.7|0.8|13.8|33.2| +||-6|176|1821|92.0|5.5|2.5|1.1|9.1|26.7| +||-3|201|2096|94.4|2.2|3.3|0.2|5.8|20.4| +||0|158|1611|95.0|3.0|2.0|0.4|5.4|19.0| +||3|173|1710|94.7|2.7|2.6|0.4|5.7|24.9| +||6|185|1920|96.2|1.8|2.0|0.5|4.3|17.8| +||9|157|1533|97.6|1.0|1.4|0.5|2.9|13.4| +||12|150|1536|96.4|1.6|2.1|0.3|4.0|20.7| +||clean|138|1390|96.7|1.4|1.9|0.4|3.7|17.4| +||reverb|177|1755|93.7|3.6|2.7|0.7|7.0|23.2| +|ambient noise|-12|187|1873|76.4|16.3|7.3|2.3|25.9|51.9| +||-9 |193|1965|84.2|10.3|5.4|1.8|17.6|40.4| +||-6 |176|1883|90.2|5.8|4.0|1.3|11.2|26.1| +||-3 |173|1851|91.2|4.8|4.0|1.0|9.8|32.9| +|| 0 |148|1470|94.8|3.0|2.2|0.7|5.9|23.6| +|| 3 |176|1718|96.0|2.1|1.9|0.3|4.3|17.0| +|| 6 |166|1714|93.7|2.9|3.4|0.5|6.8|20.5| +|| 9 |170|1601|96.9|1.5|1.6|0.3|3.4|18.2| +||12 |169|1718|95.9|2.5|1.6|0.2|4.3|20.1| +||clean |138|1390|96.7|1.4|1.9|0.4|3.7|17.4| +||reverb |177|1755|93.7|3.6|2.7|0.7|7.0|23.2| + +### WER + +|dataset|SNR in dB|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---|---| +|music noise|-12|171|912|83.4|12.5|4.1|2.4|19.0|38.6| +||-9 |187|1005|87.6|8.6|3.9|1.9|14.3|33.2| +||-6 |176|951|90.6|5.9|3.5|0.8|10.2|26.7| +||-3 |201|1097|94.4|3.3|2.3|0.6|6.2|20.4| +|| 0 |158|847|94.9|3.2|1.9|0.4|5.4|19.0| +|| 3 |173|884|94.2|3.8|1.9|0.6|6.3|24.9| +|| 6 |185|997|96.3|2.7|1.0|0.7|4.4|17.8| +|| 9 |157|817|96.9|1.7|1.3|0.4|3.4|13.4| +||12 |150|832|95.2|2.9|1.9|0.5|5.3|20.7| +||clean |138|739|95.7|2.4|1.9|0.4|4.7|17.4| +||reverb |177|943|93.6|4.0|2.3|0.4|6.8|23.2| +|ambient noise|-12|187|995|73.7|18.4|7.9|1.7|28.0|51.9| +||-9 |193|1060|83.0|11.7|5.3|1.4|18.4|40.4| +||-6 |176|971|90.2|6.8|3.0|1.4|11.2|26.1| +||-3 |173|972|90.0|6.9|3.1|1.0|11.0|32.9| +|| 0 |148|838|94.0|4.1|1.9|0.4|6.3|23.6| +|| 3 |176|909|95.5|2.9|1.7|0.3|4.8|17.0| +|| 6 |166|830|94.1|3.3|2.7|1.0|6.9|20.5| +|| 9 |170|872|95.4|3.1|1.5|0.2|4.8|18.2| +||12 |169|895|95.0|4.0|1.0|0.2|5.3|20.1| +||clean |138|739|95.7|2.4|1.9|0.4|4.7|17.4| +||reverb 
|177|943|93.6|4.0|2.3|0.4|6.8|23.2| + +## Train_pytorch_trainvideo_delta_specaug (Video-Only) + +* Model files (archived to model.tar.gz by $ pack_model.sh) + - download link: https://drive.google.com/file/d/1ZXXCXSbbFS2PDlrs9kbJL9pE6-5nPPxi/view + - training config file: conf/finetunevideo/trainvideo.yaml + - decoding config file: conf/decode.yaml + - preprocess config file: conf/specaug.yaml + - lm config file: conf/lm.yaml + - e2e file: exp/vfintune/model.last10.avg.best + - e2e json file: exp/vfintune/model.json + - lm file: exp/train_rnnlm_pytorch_lm_unigram500/rnnlm.model.best + - lm JSON file: exp/train_rnnlm_pytorch_lm_unigram500/model.json + - dict file: data/lang_char/train_unigram500_units.txt + +## Environments +- date: `Mon Feb 21 11:52:07 UTC 2022` +- python version: `3.7.3 (default, Mar 27 2019, 22:11:17) [GCC 7.3.0]` +- espnet version: `espnet 0.6.0` +- chainer version: `chainer 6.0.0` +- pytorch version: `pytorch 1.0.1.post2` + + +### CER + +|dataset|SNR in dB|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---|---| +|clean visual data|171|1669|42.3|42.5|15.2|6.4|64.1|91.8| +||-9 |187|1897|46.4|38.8|14.8|8.5|62.2|90.9| +||-6 |176|1821|48.1|37.7|14.2|9.2|61.1|92.0| +||-3 |201|2096|41.7|46.4|11.9|8.9|67.2|90.0| +|| 0 |158|1611|43.4|42.6|14.0|7.1|63.7|94.9| +|| 3 |173|1710|49.2|37.6|13.2|8.9|59.7|91.9| +|| 6 |185|1920|39.3|45.6|15.2|9.4|70.2|95.1| +|| 9 |157|1533|46.2|39.1|14.7|8.5|62.3|89.2| +||12 |150|1536|49.5|37.6|12.9|7.2|57.7|87.3| +||clean |138|1390|44.2|42.3|13.5|7.8|63.7|92.8| +||reverb |177|1755|44.8|41.5|13.6|7.5|62.7|92.1| +|visual gaussian blur|-12|187|1873|37.3|46.6|16.1|9.0|71.6|93.0| +||-9 |193|1965|43.0|44.1|13.0|11.0|68.1|93.8| +||-6 |176|1883|39.9|43.3|16.7|7.5|67.6|93.8| +||-3 |173|1851|43.7|43.8|12.5|8.2|64.5|91.9| +|| 0 |148|1470|42.3|45.4|12.3|8.2|65.9|93.9| +|| 3 |176|1718|44.8|41.5|13.7|7.9|63.1|89.2| +|| 6 |166|1714|38.5|45.4|16.0|10.7|72.2|94.6| +|| 9 |170|1601|45.1|42.8|12.1|11.7|66.6|91.2| +||12 |169|1718|42.0|40.1|17.9|8.2|66.2|92.3| +||clean |138|1390|40.4|45.5|14.2|8.7|68.3|93.5| +||reverb |177|1755|40.2|45.6|14.2|8.5|68.3|92.7| +|visual salt and pepper noise|-12|187|1873|36.2|48.1|15.8|9.9|73.7|92.0| +||-9 |193|1965|41.7|44.6|13.7|10.6|68.9|92.7| +||-6 |176|1883|36.5|47.2|16.4|8.6|72.1|93.2| +||-3 |173|1851|42.1|45.4|12.5|10.8|68.6|92.5| +|| 0 |148|1470|42.3|45.1|12.6|9.5|67.2|91.9| +|| 3 |176|1718|40.0|45.1|15.0|7.6|67.6|92.0| +|| 6 |166|1714|38.1|45.2|16.7|10.1|72.0|94.0| +|| 9 |170|1601|40.2|45.9|13.9|12.0|71.8|92.9| +||12 |169|1718|37.5|46.8|15.7|8.7|71.2|94.1| +||clean |138|1390|39.9|46.0|14.0|9.1|69.1|92.8| +||reverb |177|1755|39.9|46.2|13.9|9.1|69.2|92.7| + +### WER + +|dataset|SNR in dB|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---|---| +|clean visual data|-12|171|912|39.4|42.7|18.0|4.3|64.9|89.5| +||-9 |187|1005|43.7|40.6|15.7|5.4|61.7|86.1| +||-6 |176|951|43.3|42.6|14.1|4.1|60.8|88.6| +||-3 |201|1097|41.3|44.2|14.5|5.3|64.0|85.6| +|| 0 |158|847|44.3|37.8|17.9|6.1|61.9|85.4| +|| 3 |173|884|44.2|39.7|16.1|5.3|61.1|84.4| +|| 6 |185|997|38.2|44.8|17.0|3.9|65.7|84.9| +|| 9 |157|817|47.9|37.1|15.1|5.5|57.6|80.3| +||12 |150|832|42.9|37.6|19.5|5.3|62.4|84.0| +||clean |138|739|45.9|39.1|15.0|5.3|59.4|85.5| +||reverb |177|943|43.4|40.5|16.1|5.3|61.9|85.9| +|visual Gaussian blur|-12|187|995|35.9|45.4|18.7|5.3|69.4|86.6| +||-9 |193|1060|35.0|44.2|20.8|5.0|70.0|92.2| +||-6 |176|971|38.2|43.2|18.6|4.6|66.4|87.5| +||-3 |173|972|37.9|45.5|16.7|4.8|67.0|86.1| +|| 0 
|148|838|38.1|40.7|21.2|4.2|66.1|89.2| +|| 3 |176|909|36.0|48.5|15.5|5.9|70.0|88.6| +|| 6 |166|830|36.7|46.6|16.6|6.1|69.4|89.8| +|| 9 |170|872|39.0|45.5|15.5|4.7|65.7|87.6| +||12 |169|895|35.2|46.8|18.0|4.6|69.4|89.9| +||clean |138|739|40.7|42.2|17.1|5.0|64.3|88.4| +||reverb |177|943|38.0|44.3|17.7|5.0|67.0|89.3| +|visual salt and pepper noise|-12|187|995|32.5|48.9|18.6|4.6|72.2|83.4| +||-9 |193|1060|32.3|51.5|16.2|6.1|73.9|92.2| +||-6 |176|971|36.5|47.3|16.3|7.2|70.8|86.4| +||-3 |173|972|35.5|47.2|17.3|4.6|69.1|88.4| +|| 0 |148|838|36.9|41.5|21.6|3.7|66.8|88.5| +|| 3 |176|909|33.0|51.9|15.1|5.4|72.4|88.6| +|| 6 |166|830|35.3|49.9|14.8|8.8|73.5|88.0| +|| 9 |170|872|41.2|43.3|15.5|5.6|64.4|84.7| +||12 |169|895|34.2|47.8|18.0|7.3|73.1|91.1| +||clean |138|739|37.5|47.8|14.7|7.3|69.8|86.2| +||reverb |177|943|35.9|47.9|16.1|6.7|70.7|87.0| + +## Train_pytorch_trainavs_delta_specaug (Audio-Visual) + +* Model files (archived to model.tar.gz by $ pack_model.sh) + - download link: https://drive.google.com/file/d/1ZXXCXSbbFS2PDlrs9kbJL9pE6-5nPPxi/view + - training config file: conf/finetuneav/trainavs.yaml + - decoding config file: conf/decode.yaml + - preprocess config file: conf/specaug.yaml + - lm config file: conf/lm.yaml + - cmvn file: data/train/cmvn.ark + - e2e file: exp/avfintune/model.last10.avg.best + - e2e json file: exp/avfintune/model.json + - lm file: exp/train_rnnlm_pytorch_lm_unigram500/rnnlm.model.best + - lm JSON file: exp/train_rnnlm_pytorch_lm_unigram500/model.json + - dict file: data/lang_char/train_unigram500_units.txt + +## Environments +- date: `Mon Feb 21 11:52:07 UTC 2022` +- python version: `3.7.3 (default, Mar 27 2019, 22:11:17) [GCC 7.3.0]` +- espnet version: `espnet 0.6.0` +- chainer version: `chainer 6.0.0` +- pytorch version: `pytorch 1.0.1.post2` + + +### CER + +|dataset|SNR in dB|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---|---| +|music noise with clean visual data |-12|171|1669|90.7|5.4|3.9|0.7|9.9|26.3| +||-9 |187|1897|93.7|3.5|2.7|0.4|6.7|25.1| +||-6 |176|1821|95.1|2.9|2.0|0.4|5.4|18.8| +||-3 |201|2096|96.2|1.6|2.2|0.3|4.2|15.9| +|| 0 |158|1611|96.4|1.9|1.7|0.2|3.8|13.9| +|| 3 |173|1710|96.7|1.7|1.6|0.2|3.6|17.9| +|| 6 |185|1920|96.1|1.6|2.2|0.5|4.3|18.9| +|| 9 |157|1533|96.9|1.4|1.7|0.5|3.6|14.0| +||12 |150|1536|96.5|1.4|2.1|0.5|4.0|21.3| +||clean |138|1390|97.9|0.9|1.2|0.2|2.3|13.8| +||reverb |177|1755|96.8|1.5|1.8|0.2|3.5|16.4| +|ambient noise with clean visual data |-12|187|1873|89.6|5.8|4.6|1.2|11.5|31.0| +||-9 |193|1965|91.2|5.0|3.8|0.9|9.6|29.0| +||-6 |176|1883|94.3|1.9|3.8|0.3|6.0|21.0| +||-3 |173|1851|94.8|2.7|2.5|0.9|6.1|22.0| +|| 0 |148|1470|96.3|1.6|2.0|0.1|3.8|16.9| +|| 3 |176|1718|97.7|1.5|0.8|0.1|2.4|12.5| +|| 6 |166|1714|96.6|1.6|1.8|0.2|3.6|16.3| +|| 9 |170|1601|97.0|1.6|1.4|0.3|3.3|17.1| +||12 |169|1718|95.4|2.6|2.0|0.1|4.7|20.7| +||clean |138|1390|97.9|0.9|1.2|0.2|2.3|13.8| +||reverb |177|1755|96.8|1.5|1.8|0.2|3.5|16.4| +|ambient noise with visual Gaussian blur|-12|187|1873|86.9|7.3|5.8|1.1|14.2|35.8| +||-9 |193|1965|91.1|5.4|3.5|1.0|9.9|30.1| +||-6 |176|1883|93.3|2.7|4.0|0.3|7.0|24.4| +||-3 |173|1851|95.1|2.5|2.4|0.8|5.7|21.4| +|| 0 |148|1470|96.3|1.6|2.1|0.1|3.8|17.6| +|| 3 |176|1718|97.3|1.6|1.2|0.2|2.9|13.6| +|| 6 |166|1714|96.2|1.8|2.0|0.2|4.0|18.1| +|| 9 |170|1601|97.0|1.4|1.6|0.2|3.2|16.5| +||12 |169|1718|94.9|2.8|2.3|0.3|5.4|23.1| +||clean |138|1390|97.8|0.9|1.3|0.2|2.4|14.5| +||reverb |177|1755|96.5|1.5|2.1|0.2|3.7|16.9| +|ambient noise with visual salt and pepper 
noise|-12|187|1873|87.6|7.0|5.4|1.3|13.8|35.8| +||-9 |193|1965|91.0|5.8|3.2|1.3|10.3|30.6| +||-6 |176|1883|93.6|2.0|4.4|0.4|6.9|24.4| +||-3 |173|1851|95.6|2.9|1.6|0.8|5.2|20.2| +|| 0 |148|1470|95.9|1.9|2.2|0.1|4.2|18.2| +|| 3 |176|1718|98.0|1.0|1.0|0.3|2.3|13.1| +|| 6 |166|1714|96.4|1.8|1.8|0.2|3.7|17.5| +|| 9 |170|1601|97.0|1.4|1.6|0.4|3.4|16.5| +||12 |169|1718|96.2|2.2|1.6|0.2|4.1|18.9| +||clean |138|1390|98.1|0.9|1.1|0.2|2.2|13.0| +||reverb |177|1755|96.6|1.5|1.9|0.2|3.6|16.9| + +### WER + +|dataset|SNR in dB|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---|---| +|music noise with clean visual data |-12|171|912|91.2|6.0|2.7|1.5|10.3|26.3| +||-9 |187|1005|93.2|4.5|2.3|0.4|7.2|25.1| +||-6 |176|951|94.1|3.7|2.2|0.3|6.2|18.8| +||-3 |201|1097|95.2|2.7|2.1|0.4|5.2|15.9| +|| 0 |158|847|96.7|2.2|1.1|0.4|3.7|13.9| +|| 3 |173|884|95.6|2.6|1.8|0.3|4.8|17.9| +|| 6 |185|997|95.5|2.3|2.2|0.7|5.2|18.9| +|| 9 |157|817|96.2|2.1|1.7|0.7|4.5|14.0| +||12 |150|832|95.1|2.4|2.5|0.2|5.2|21.3| +||clean |138|739|97.2|1.5|1.4|0.4|3.2|13.8| +||reverb |177|943|96.0|1.8|2.2|0.3|4.3|16.4| +|ambient noise with clean visual data |-12|187|995|90.4|6.9|2.7|1.1|10.8|31.0| +||-9 |193|1060|91.3|5.6|3.1|1.4|10.1|29.0| +||-6 |176|971|94.4|2.9|2.7|0.3|5.9|21.0| +||-3 |173|972|93.7|3.7|2.6|0.1|6.4|22.0| +|| 0 |148|838|95.7|2.0|2.3|0.1|4.4|16.9| +|| 3 |176|909|97.0|1.5|1.4|0.3|3.3|12.5| +|| 6 |166|830|96.0|1.9|2.0|0.6|4.6|16.3| +|| 9 |170|872|95.6|3.4|0.9|0.2|4.6|17.1| +||12 |169|895|94.0|3.7|2.3|0.4|6.5|20.7| +||clean |138|739|97.2|1.5|1.4|0.4|3.2|13.8| +||reverb |177|943|96.0|1.8|2.2|0.3|4.3|16.4| +|ambient noise with visual Gaussian blur|-12|187|995|87.0|9.1|3.8|1.0|14.0|35.8| +||-9 |193|1060|90.6|6.2|3.2|1.1|10.6|30.1| +||-6 |176|971|93.2|3.6|3.2|0.3|7.1|24.4| +||-3 |173|972|94.0|3.6|2.4|0.1|6.1|21.4| +|| 0 |148|838|95.6|2.3|2.1|0.2|4.7|17.6| +|| 3 |176|909|96.3|1.7|2.1|0.3|4.1|13.6| +|| 6 |166|830|95.4|2.3|2.3|0.6|5.2|18.1| +|| 9 |170|872|95.6|3.1|1.3|0.2|4.6|16.5| +||12 |169|895|93.2|4.4|2.5|0.4|7.3|23.1| +||clean |138|739|97.0|1.5|1.5|0.4|3.4|14.5| +||reverb |177|943|95.7|1.7|2.7|0.3|4.7|16.9| +|ambient noise with visual salt and pepper noise|-12|187|995|87.1|8.8|4.0|0.9|13.8|35.8| +||-9 |193|1060|90.5|6.3|3.2|1.1|10.7|30.6| +||-6 |176|971|93.3|3.2|3.5|0.3|7.0|24.4| +||-3 |173|972|94.7|3.8|1.5|0.2|5.6|20.2| +|| 0 |148|838|95.3|2.4|2.3|0.2|4.9|18.2| +|| 3 |176|909|96.8|1.4|1.8|0.3|3.5|13.1| +|| 6 |166|830|95.9|2.2|1.9|0.7|4.8|17.5| +|| 9 |170|872|95.6|3.1|1.3|0.2|4.6|16.5| +||12 |169|895|94.7|3.5|1.8|0.3|5.6|18.9| +||clean |138|739|97.4|1.5|1.1|0.4|3.0|13.0| +||average |177|943|95.8|1.9|2.3|0.4|4.7|16.9| diff --git a/egs/lrs/avsr1/cmd.sh b/egs/lrs/avsr1/cmd.sh new file mode 100755 index 00000000000..4d70c9c7a79 --- /dev/null +++ b/egs/lrs/avsr1/cmd.sh @@ -0,0 +1,89 @@ +# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ====== +# Usage: .pl [options] JOB=1: +# e.g. +# run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB +# +# Options: +# --time